import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly.express as px
from topicmodel import TopicModel
# Columns that together form the index of the CORPUS table.
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']


def _load_term_table(path, index_cols):
    """Read a CSV, cast its term_str column to str, and index it by index_cols."""
    table = pd.read_csv(path)
    table['term_str'] = table['term_str'].astype('str')
    return table.set_index(index_cols)


# Bag-of-words table, indexed by (book_id, chap_id, term_str).
BOW = _load_term_table("full_BOW.csv", ['book_id', 'chap_id', 'term_str'])
# Library and token tables are indexed directly while reading.
LIB = pd.read_csv("full_LIB.csv", index_col=['book_id'])
CORPUS = pd.read_csv("full_CORPUS.csv", index_col=OHCO)
# Vocabulary table, indexed by term_str.
VOCAB = _load_term_table("full_VOCAB.csv", 'term_str')
# pos_group holds the first two characters of max_pos.
VOCAB['pos_group'] = VOCAB['max_pos'].str.slice(0, 2)
VOCAB.head()
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | ... | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | pos_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||||||||
| the | 1 | 418963 | 3 | 0.052764 | 4.244302 | DT | 22 | {'PRP', 'FW', 'RB', 'NN', 'JJS', 'NNP', 'VBZ',... | 1 | the | ... | the | 1 | 418963 | 418963 | 0.001204 | 0.001261 | 2288 | 0.001261 | 2.884130 | DT |
| and | 2 | 310105 | 3 | 0.039054 | 4.678368 | CC | 20 | {'PRP', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | and | ... | and | 2 | 620210 | 620210 | 0.001852 | 0.002522 | 2286 | 0.002522 | 5.765737 | CC |
| of | 3 | 218996 | 2 | 0.027580 | 5.180221 | IN | 19 | {'PRP', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | of | ... | of | 3 | 656988 | 656988 | 0.000965 | 0.001891 | 2287 | 0.001891 | 4.325249 | IN |
| to | 4 | 206700 | 2 | 0.026032 | 5.263587 | TO | 23 | {'WDT', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | to | ... | to | 4 | 826800 | 826800 | 0.000924 | 0.001891 | 2287 | 0.001891 | 4.325249 | TO |
| a | 5 | 189310 | 1 | 0.023842 | 5.390375 | DT | 21 | {'RBR', 'PRP', 'FW', 'RB', 'NN', 'NNP', 'VBZ',... | 1 | a | ... | a | 5 | 946550 | 946550 | 0.001707 | 0.003785 | 2284 | 0.003785 | 8.644820 | DT |
5 rows × 21 columns
# preview the first rows of the bag-of-words table
BOW.head()
| n | tf | tfidf | |||
|---|---|---|---|---|---|
| book_id | chap_id | term_str | |||
| 70 | 1 | 1835 | 1 | 0.142857 | 1.262743 |
| 1910 | 1 | 0.142857 | 1.225167 | ||
| a | 2 | 0.285714 | 0.001081 | ||
| alphabet | 1 | 0.142857 | 0.873820 | ||
| as | 2 | 0.285714 | 0.007080 |
# preview the first rows of the library table
LIB.head()
| source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | label | |
|---|---|---|---|---|---|---|---|---|---|---|
| book_id | ||||||||||
| 70 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain 70: what is man |
| 74 | Twain/74-the_adventures_of_tom_sawyer.txt | the adventures of tom sawyer | ^\s*CHAPTER\s*[IVXLCM]+$ | twain | novel | 1876 | 1870 | 35 | 70276 | twain 74: the adventures of tom sawyer |
| 76 | Twain/76-the_adventures_of_huckleberry_finn.txt | the adventures of huckleberry finn | ^\s*CHAPTER\s*(?:[IVXLCM]+\.|THE LAST)$ | twain | novel | 1884 | 1880 | 43 | 111908 | twain 76: the adventures of huckleberry finn |
| 86 | Twain/86-a_connecticut_yankee_in_king_arthurs_... | a connecticut yankee in king arthurs court | ^\s*(?:PREFACE|A WORD OF EXPLANATION|THE STRAN... | twain | novel | 1889 | 1880 | 47 | 119100 | twain 86: a connecticut yankee in king arthurs... |
| 91 | Twain/91-tom_sawyer_abroad.txt | tom sawyer abroad | CHAPTER\s[IVXLCM]+\. | twain | novel | 1894 | 1890 | 13 | 33969 | twain 91: tom sawyer abroad |
# attach VOCAB's per-term columns to every BOW row, matching on term_str;
# VOCAB columns whose names collide with BOW's get the "_vocab" suffix
bow_by_term = BOW.reset_index().set_index('term_str')
joint_BOW = bow_by_term.join(VOCAB, rsuffix = "_vocab")
# keep a row only if it has no nan in any column and its max_pos is not a
# proper-noun tag
has_no_nan = joint_BOW.notna().all(axis = 1)
is_proper_noun = joint_BOW['max_pos'].isin(['NNP', 'NNPS'])
joint_BOW = joint_BOW[has_no_nan & ~is_proper_noun]
joint_BOW
| book_id | chap_id | n | tf | tfidf | term_rank | n_vocab | n_chars | p | i | ... | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | pos_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||||||||
| 0 | 588 | 7 | 2 | 0.040000 | 0.326445 | 7549 | 65 | 1 | 8.186068e-06 | 16.898398 | ... | 0 | 1435 | 490685 | 93275 | 0.278472 | 1.492147 | 8 | 8.161132 | 65.289055 | CD |
| 0 | 786 | 16 | 1 | 0.012987 | 0.105989 | 7549 | 65 | 1 | 8.186068e-06 | 16.898398 | ... | 0 | 1435 | 490685 | 93275 | 0.278472 | 1.492147 | 8 | 8.161132 | 65.289055 | CD |
| 0 | 882 | 47 | 1 | 0.001244 | 0.010151 | 7549 | 65 | 1 | 8.186068e-06 | 16.898398 | ... | 0 | 1435 | 490685 | 93275 | 0.278472 | 1.492147 | 8 | 8.161132 | 65.289055 | CD |
| 0 | 912 | 3 | 3 | 0.005714 | 0.046635 | 7549 | 65 | 1 | 8.186068e-06 | 16.898398 | ... | 0 | 1435 | 490685 | 93275 | 0.278472 | 1.492147 | 8 | 8.161132 | 65.289055 | CD |
| 0 | 1414 | 1 | 49 | 0.182836 | 1.492147 | 7549 | 65 | 1 | 8.186068e-06 | 16.898398 | ... | 0 | 1435 | 490685 | 93275 | 0.278472 | 1.492147 | 8 | 8.161132 | 65.289055 | CD |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| étouffante | 60900 | 5 | 1 | 0.007752 | 0.086520 | 50882 | 1 | 10 | 1.259395e-07 | 22.920766 | ... | étouff | 1499 | 50882 | 1499 | 0.086520 | 0.086520 | 1 | 11.161132 | 11.161132 | NN |
| évitant | 3189 | 3 | 1 | 0.004132 | 0.046120 | 50885 | 1 | 7 | 1.259395e-07 | 22.920766 | ... | évit | 1499 | 50885 | 1499 | 0.046120 | 0.046120 | 1 | 11.161132 | 11.161132 | VB |
| êtes | 3189 | 3 | 1 | 0.004132 | 0.046120 | 50890 | 1 | 4 | 1.259395e-07 | 22.920766 | ... | ête | 1499 | 50890 | 1499 | 0.046120 | 0.046120 | 1 | 11.161132 | 11.161132 | NN |
| öffnen | 60900 | 6 | 1 | 0.004608 | 0.051434 | 50891 | 1 | 6 | 1.259395e-07 | 22.920766 | ... | öffnen | 1499 | 50891 | 1499 | 0.051434 | 0.051434 | 1 | 11.161132 | 11.161132 | NN |
| übergeschlagen | 60900 | 6 | 1 | 0.004608 | 0.051434 | 50895 | 1 | 14 | 1.259395e-07 | 22.920766 | ... | übergeschl | 1499 | 50895 | 1499 | 0.051434 | 0.051434 | 1 | 11.161132 | 11.161132 | NN |
2213701 rows × 26 columns
# recover filtered BOW --> keep only BOW's own value columns and restore its
# (book_id, chap_id, term_str) index. Selecting BOW's columns by name (instead of
# dropping everything from 'n_vocab' onward) also removes VOCAB's term_rank, which
# the join places before n_vocab and which the positional slice left behind.
filtered_BOW = (
    joint_BOW
    .reset_index()
    .set_index(list(BOW.index.names))[list(BOW.columns)]
)
# sort on the full index (book, chapter, term) so the row order is deterministic;
# sorting on book_id alone leaves the order within each book unspecified
filtered_BOW = filtered_BOW.sort_index()
filtered_BOW
| n | tf | tfidf | term_rank | |||
|---|---|---|---|---|---|---|
| book_id | chap_id | term_str | ||||
| 70 | 17 | theological | 1 | 0.000749 | 0.005123 | 14167 |
| 3 | vague | 1 | 0.004739 | 0.015596 | 2509 | |
| 12 | article | 1 | 0.062500 | 0.181779 | 1280 | |
| 2 | miserable | 5 | 0.003658 | 0.008645 | 1053 | |
| 4 | miserable | 1 | 0.004065 | 0.009608 | 1053 | |
| ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | buckets | 2 | 0.005556 | 0.035590 | 11706 |
| 4 | beguiled | 1 | 0.003534 | 0.018828 | 8063 | |
| number | 1 | 0.003534 | 0.006499 | 638 | ||
| afterward | 1 | 0.003534 | 0.011499 | 2069 | ||
| 2 | subheadings | 1 | 0.002778 | 0.031003 | 58356 |
2213701 rows × 4 columns
# fraction of BOW rows removed by the nan and proper-noun (NNP/NNPS) filters: ~ 4.1%
(BOW.shape[0] - filtered_BOW.shape[0]) / BOW.shape[0]
0.04075437915339256
# topic-model settings; both are assigned onto the TopicModel instance below
n_topics = 40
n_terms = 2000
# TopicModel is imported from the local topicmodel module and is built from the
# filtered bag-of-words table
tm = TopicModel(filtered_BOW)
tm.n_topics = n_topics
tm.n_terms = n_terms
# NOTE(review): these calls run in a fixed order; presumably each step relies on
# state produced by the previous one -- confirm against topicmodel.TopicModel
tm.create_X()
tm.get_model()
tm.describe_topics()
tm.get_model_stats()
tm.plot_topics()
# table with distribution of topics for each doc
tm.THETA
| topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 70 | 1 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.679131 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | ... | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.002500 | 0.002500 |
| 2 | 0.024762 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.117217 | 0.000005 | 0.000005 | 0.210831 | 0.000005 | ... | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.014441 | 0.000005 | 0.000005 | 0.000005 | |
| 3 | 0.130919 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.029978 | 0.000040 | 0.172546 | 0.000040 | ... | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | 0.000040 | |
| 4 | 0.000045 | 0.000045 | 0.000045 | 0.005706 | 0.000045 | 0.126276 | 0.000045 | 0.054103 | 0.477511 | 0.000045 | ... | 0.005885 | 0.000045 | 0.000045 | 0.000045 | 0.000045 | 0.000045 | 0.101579 | 0.000045 | 0.000045 | 0.000045 | |
| 5 | 0.000032 | 0.000032 | 0.000032 | 0.000032 | 0.000032 | 0.000032 | 0.000032 | 0.000032 | 0.411315 | 0.000032 | ... | 0.000032 | 0.074526 | 0.000032 | 0.000032 | 0.000032 | 0.238227 | 0.000032 | 0.000032 | 0.000032 | 0.000032 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.027685 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.167650 | 0.000027 | 0.204051 | 0.236223 | 0.000027 | ... | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.023993 | 0.000027 | 0.049100 | 0.000027 | 0.059034 | 0.000027 |
| 3 | 0.068494 | 0.000269 | 0.000269 | 0.088688 | 0.000269 | 0.109124 | 0.000269 | 0.000269 | 0.542757 | 0.000269 | ... | 0.000269 | 0.000269 | 0.000269 | 0.000269 | 0.000269 | 0.000269 | 0.000269 | 0.000269 | 0.000269 | 0.000269 | |
| 4 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.496019 | 0.000063 | 0.050351 | 0.088109 | 0.000063 | ... | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | |
| 5 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.654181 | 0.000179 | 0.066993 | 0.000179 | 0.000179 | ... | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | 0.000179 | |
| 6 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | ... | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.283106 | 0.000714 | 0.000714 | 0.000714 | 0.000714 | 0.000714 |
2290 rows × 40 columns
# distribution of words over topics (rows: topic_id, columns: term_str)
tm.PHI
| term_str | knowing | shaking | consider | twelve | closed | cry | shoulder | dropped | garden | considered | ... | possibility | burden | nearest | settle | stuff | voyage | dwelling | lodging | spend | picturesque |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| topic_id | |||||||||||||||||||||
| 0 | 29.803706 | 2.632424 | 33.944118 | 12.519358 | 7.809385 | 37.463780 | 2.403779 | 0.025000 | 0.025000 | 21.815540 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 1 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 35.215981 | 0.025000 | 9.250676 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 5.577009 | 0.025000 | 11.030238 | 0.035628 | 13.556681 | 0.025000 |
| 2 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.708888 | 2.391782 | 1.659831 | 3.452461 | 0.025000 | 1.804375 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.628647 | 1.498681 | 1.655751 | 0.025000 | 2.568545 |
| 3 | 0.025000 | 0.025000 | 1.960878 | 0.025000 | 3.849825 | 3.454260 | 13.902374 | 18.202105 | 0.025000 | 0.025000 | ... | 0.025000 | 8.423743 | 0.025000 | 0.025000 | 4.509114 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 4 | 75.480954 | 84.434048 | 20.012255 | 43.281975 | 196.011303 | 167.818798 | 164.086678 | 197.575852 | 27.890037 | 4.729179 | ... | 0.025000 | 7.985600 | 43.892984 | 0.458997 | 20.901972 | 0.025000 | 8.952264 | 57.824534 | 8.271555 | 0.025000 |
| 5 | 13.626903 | 0.025000 | 74.970403 | 40.375234 | 7.369458 | 0.025000 | 18.693192 | 0.025000 | 12.460580 | 65.865511 | ... | 12.738675 | 9.423386 | 3.085421 | 1.226774 | 0.025000 | 0.025000 | 4.080600 | 0.025000 | 0.025000 | 9.975268 |
| 6 | 0.025000 | 0.025000 | 0.025000 | 8.468638 | 2.351202 | 6.540831 | 0.025000 | 0.182530 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 7 | 16.102627 | 1.654308 | 0.025000 | 9.536278 | 80.832588 | 90.249374 | 35.460406 | 50.682250 | 0.025000 | 35.566411 | ... | 0.025000 | 11.341476 | 22.849630 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 4.084613 | 0.025000 | 0.025000 |
| 8 | 32.223802 | 0.025000 | 71.012268 | 69.439364 | 44.537920 | 26.612234 | 44.513576 | 72.118977 | 16.216869 | 33.424302 | ... | 0.025000 | 38.943653 | 10.588882 | 25.299718 | 45.238878 | 0.720845 | 0.025000 | 0.025000 | 41.215779 | 27.541300 |
| 9 | 9.679891 | 0.025000 | 0.025000 | 7.111885 | 12.287194 | 0.874992 | 0.025000 | 12.174507 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 10.725158 | 0.025000 | 0.025000 | 0.025000 | 2.988248 | 0.025000 | 0.025000 |
| 10 | 0.025000 | 1.151489 | 0.025000 | 0.025000 | 14.875877 | 7.522422 | 3.942505 | 0.025000 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 11 | 8.343296 | 4.381569 | 0.025000 | 17.679644 | 6.297713 | 0.032241 | 0.025000 | 1.168300 | 0.025000 | 2.801206 | ... | 3.674327 | 0.180946 | 5.511116 | 0.025000 | 1.982134 | 149.772603 | 0.025000 | 1.813084 | 0.025000 | 0.025000 |
| 12 | 0.025000 | 0.025000 | 7.743518 | 17.285492 | 7.963162 | 0.025000 | 0.025000 | 0.025000 | 4.276403 | 1.656301 | ... | 0.025000 | 0.025000 | 1.157202 | 0.025000 | 0.025000 | 0.025000 | 1.720054 | 0.025000 | 0.025000 | 0.025000 |
| 13 | 21.376924 | 0.025000 | 49.878052 | 0.025000 | 0.025000 | 0.025000 | 3.160418 | 0.025000 | 0.025000 | 5.648873 | ... | 0.025000 | 1.177087 | 0.205146 | 11.441066 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 3.843233 | 0.025000 |
| 14 | 31.847859 | 0.025000 | 23.676464 | 61.351506 | 0.025000 | 15.459161 | 0.025000 | 7.704192 | 0.025000 | 46.166643 | ... | 0.025000 | 0.025000 | 0.025000 | 14.931043 | 0.025000 | 6.529766 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 15 | 11.187464 | 74.811755 | 0.025000 | 0.025000 | 0.025000 | 12.286402 | 66.907514 | 49.549146 | 11.365151 | 76.755038 | ... | 3.495694 | 15.304989 | 0.025000 | 16.677174 | 0.025000 | 24.415916 | 0.025000 | 7.295656 | 2.491104 | 0.025000 |
| 16 | 19.493094 | 0.025000 | 2.276214 | 1.726161 | 1.954341 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 8.792720 | ... | 0.108883 | 0.025000 | 2.380176 | 0.025000 | 0.025000 | 1.419562 | 12.063842 | 0.025000 | 0.025000 | 1.082658 |
| 17 | 1.322578 | 2.565197 | 0.025000 | 4.578351 | 1.402625 | 0.025000 | 0.025000 | 1.352698 | 0.025000 | 3.497147 | ... | 2.537371 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.288036 | 2.217377 | 0.025000 | 0.025000 | 0.025000 |
| 18 | 21.133021 | 13.256124 | 8.168496 | 23.891582 | 0.025000 | 34.942834 | 28.583829 | 93.570654 | 7.104560 | 3.321179 | ... | 0.025000 | 0.025000 | 3.688121 | 18.657465 | 39.919737 | 2.081739 | 0.025000 | 0.025000 | 8.157265 | 0.025000 |
| 19 | 0.025000 | 42.808918 | 1.464008 | 0.025000 | 3.513799 | 0.025000 | 8.640975 | 0.025000 | 7.003182 | 8.670020 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 10.801746 | 0.025000 |
| 20 | 10.064987 | 0.025000 | 0.025000 | 0.081036 | 90.918706 | 13.785262 | 9.915553 | 13.300951 | 11.208912 | 0.025000 | ... | 23.571790 | 12.827998 | 11.007790 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 2.294718 | 0.025000 |
| 21 | 1.142914 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 3.725885 | 10.471178 | 2.232430 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 2.946364 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 22 | 72.279834 | 96.609799 | 114.633582 | 32.197280 | 83.724876 | 85.895105 | 39.578045 | 37.819914 | 32.694656 | 60.842399 | ... | 18.714615 | 19.494546 | 35.353959 | 26.422515 | 4.307336 | 0.025000 | 2.709676 | 0.025000 | 11.460013 | 0.025000 |
| 23 | 5.362592 | 6.310348 | 3.166873 | 28.307653 | 13.007660 | 4.058319 | 0.025000 | 7.754784 | 0.025000 | 7.600486 | ... | 11.881461 | 4.472232 | 3.743454 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 24 | 38.194408 | 0.025000 | 54.097946 | 21.780341 | 16.705373 | 10.180615 | 3.768525 | 26.306709 | 4.351668 | 19.731678 | ... | 12.576831 | 28.030378 | 6.070825 | 27.199506 | 21.279084 | 26.678851 | 8.288394 | 0.025000 | 66.765653 | 17.114853 |
| 25 | 31.775613 | 11.047148 | 0.025000 | 12.421110 | 0.025000 | 0.346572 | 1.846999 | 0.250784 | 23.987292 | 0.025000 | ... | 0.025000 | 0.025000 | 28.431248 | 0.025000 | 17.400865 | 0.025000 | 18.229904 | 64.388861 | 0.025000 | 4.537006 |
| 26 | 0.025000 | 0.025000 | 9.625074 | 9.392131 | 0.025000 | 0.025000 | 0.025000 | 2.795149 | 1.598719 | 0.025000 | ... | 0.025000 | 0.025000 | 10.298809 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 27 | 42.020479 | 38.414494 | 12.402814 | 1.494123 | 32.853313 | 93.107579 | 21.174153 | 35.955751 | 93.719243 | 14.333302 | ... | 2.142920 | 23.762260 | 0.432294 | 0.025000 | 8.245906 | 22.778387 | 1.233422 | 0.025000 | 0.025000 | 0.025000 |
| 28 | 0.025000 | 0.025000 | 0.025000 | 85.718655 | 24.547001 | 0.025000 | 20.341226 | 29.997187 | 0.585060 | 39.655535 | ... | 11.707846 | 4.486146 | 0.025000 | 5.014257 | 32.590940 | 10.931882 | 16.543455 | 0.025000 | 8.250319 | 54.346395 |
| 29 | 217.276236 | 254.303413 | 280.261185 | 14.941531 | 29.955200 | 62.694751 | 167.677160 | 77.864945 | 212.390353 | 272.223810 | ... | 81.726759 | 20.966735 | 18.546349 | 46.774310 | 1.311540 | 0.025000 | 53.827873 | 75.810472 | 24.546784 | 0.025000 |
| 30 | 0.025000 | 0.025000 | 0.025000 | 38.662668 | 0.025000 | 30.517891 | 9.030942 | 34.186202 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 5.347885 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 31 | 4.249229 | 20.218358 | 0.485445 | 39.265459 | 27.336795 | 29.613348 | 37.690124 | 2.590484 | 148.408920 | 1.468198 | ... | 19.387483 | 16.532549 | 6.021140 | 10.993086 | 0.027953 | 0.025000 | 8.258435 | 3.894618 | 9.119391 | 0.025000 |
| 32 | 27.560903 | 7.941185 | 6.711003 | 69.540270 | 15.311309 | 10.087222 | 0.025666 | 10.797605 | 63.927365 | 54.072481 | ... | 37.765860 | 0.025000 | 0.025000 | 0.893970 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 17.534676 | 16.487318 |
| 33 | 50.759404 | 169.260632 | 12.142615 | 0.025000 | 40.692670 | 0.025000 | 79.046099 | 0.025000 | 0.025000 | 21.323407 | ... | 2.853101 | 0.025000 | 10.276978 | 17.788181 | 22.996563 | 0.025000 | 1.674992 | 18.498694 | 10.235533 | 0.025000 |
| 34 | 7.418504 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 6.886808 | 0.025000 | 25.332428 | 2.766354 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 4.112911 | 1.053766 | 5.124571 | 0.025000 | 0.025000 | 0.025000 |
| 35 | 0.328553 | 0.025000 | 0.025000 | 54.190975 | 25.983776 | 9.712992 | 21.011513 | 0.491267 | 132.623649 | 0.557626 | ... | 4.516384 | 0.025000 | 3.590259 | 0.025000 | 0.025000 | 0.025000 | 26.311325 | 0.025000 | 0.025000 | 114.571657 |
| 36 | 43.619225 | 12.648792 | 29.729033 | 22.771667 | 52.873042 | 34.567687 | 25.117739 | 7.247595 | 32.023591 | 2.234317 | ... | 0.025000 | 18.965900 | 9.420083 | 6.519383 | 1.866407 | 0.025000 | 37.280402 | 0.025000 | 4.961212 | 0.025000 |
| 37 | 0.025000 | 0.025000 | 6.849315 | 0.025000 | 0.025000 | 2.679873 | 0.025000 | 0.025000 | 1.922435 | 0.025000 | ... | 0.025000 | 0.025000 | 2.313422 | 5.481034 | 0.025000 | 0.025000 | 19.500115 | 0.025000 | 4.919339 | 0.025000 |
| 38 | 0.025000 | 0.025000 | 19.363443 | 95.155163 | 0.025000 | 0.025000 | 0.025000 | 16.841468 | 0.025000 | 21.978784 | ... | 0.025000 | 7.130375 | 5.386828 | 0.025000 | 17.156650 | 0.025000 | 8.954380 | 11.009842 | 0.025000 | 0.025000 |
| 39 | 0.025000 | 0.025000 | 0.025000 | 3.559470 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 12.163531 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
40 rows × 2000 columns
# topic table ordered from the largest theta_sum to the smallest
tm.TOPIC.sort_values('theta_sum', ascending = False)
| phi_sum | theta_sum | h | top_terms_rel | top_terms | label | |
|---|---|---|---|---|---|---|
| topic_id | ||||||
| 29 | 138476.137464 | 220.112446 | 10.22 | guardian cousin assure sister confidence pursu... | sister understand observed guardian confidence... | 29: guardian cousin assure sister confidence p... |
| 8 | 89131.260172 | 217.495613 | 9.98 | detail color doesnt details recognized honor r... | presently toward ones isnt everybody war able ... | 8: detail color doesnt details recognized hono... |
| 5 | 56343.692496 | 150.168832 | 9.78 | institution science political class national s... | society human law character knowledge class ch... | 5: institution science political class nationa... |
| 28 | 48624.881840 | 118.187148 | 9.79 | lake mountain valley mountains ice rock miles ... | miles mountain land lake rock mountains distan... | 28: lake mountain valley mountains ice rock mi... |
| 4 | 62850.031662 | 113.004098 | 9.81 | lock alarm lamp darkness dread horror muttered... | figure slowly wind answered sound lips breast ... | 4: lock alarm lamp darkness dread horror mutte... |
| 22 | 72011.366796 | 111.941402 | 10.07 | rejoined inquired interposed hastily gentleman... | inquired rejoined exclaimed countenance servan... | 22: rejoined inquired interposed hastily gentl... |
| 24 | 50656.218241 | 105.019908 | 9.73 | lecture 3 wrote literary 2 author letters mach... | wrote letters write written story paper writin... | 24: lecture 3 wrote literary 2 author letters ... |
| 18 | 34475.634392 | 101.499568 | 9.25 | reckon warnt nigger bet maybe theyre anyway ju... | warnt reckon hes minute hadnt everybody big ma... | 18: reckon warnt nigger bet maybe theyre anywa... |
| 32 | 50898.403590 | 97.576213 | 9.85 | theatre audience dancing ball stout applause c... | party everybody wine glass stage appearance bl... | 32: theatre audience dancing ball stout applau... |
| 35 | 41938.713871 | 85.799296 | 9.61 | marble centuries pictures ancient picturesque ... | stone picture walls ancient sea houses streets... | 35: marble centuries pictures ancient pictures... |
| 36 | 45127.533127 | 82.742349 | 9.75 | merry charity sisters sorrow mercy brothers ne... | spirit merry bear brothers tears earth thought... | 36: merry charity sisters sorrow mercy brother... |
| 33 | 41170.236899 | 69.274817 | 9.54 | ha ant eh youre hes jolly em havent retorted | ha youre hes em youll whats office eh pleasant | 33: ha ant eh youre hes jolly em havent retorted |
| 7 | 29074.794954 | 64.261290 | 9.31 | soldiers sword military rode armed crowd offic... | crowd soldiers horse military sword force guar... | 7: soldiers sword military rode armed crowd of... |
| 31 | 34759.348239 | 56.914861 | 9.45 | travelling wheels horses landlord lamps roads ... | road horses horse carriage wind weather trees ... | 31: travelling wheels horses landlord lamps ro... |
| 14 | 28395.221946 | 56.375642 | 9.46 | council lords army french castle fought killed... | french sent army died war castle killed afterw... | 14: council lords army french castle fought ki... |
| 0 | 20027.842032 | 55.893150 | 8.94 | maid count lie forever voices grace hearts beg... | lie ah none toward tears truth noble saying vo... | 0: maid count lie forever voices grace hearts ... |
| 20 | 29544.283556 | 55.121146 | 9.67 | dwarf grandfather beneath childs sleeping drea... | dwarf grandfather strange sleep silence led no... | 20: dwarf grandfather beneath childs sleeping ... |
| 15 | 32163.255500 | 54.755932 | 9.43 | bottle parlour shop lad leg wooden instrument ... | shop bottle parlour tea leg em wooden sister coat | 15: bottle parlour shop lad leg wooden instrum... |
| 27 | 29961.847416 | 53.136875 | 9.49 | aunt loved loving nurse mama kiss darling forg... | aunt loved tears sweet sitting quiet loving da... | 27: aunt loved loving nurse mama kiss darling ... |
| 25 | 26769.288410 | 48.638219 | 9.40 | waiter shops idle police dirty market plate ho... | houses streets shop idle waiter iron windows w... | 25: waiter shops idle police dirty market plat... |
| 38 | 17024.101091 | 45.659177 | 8.48 | dollars cent wages cents per sold sell buy coal | dollars gold pay worth per sold silver cent go... | 38: dollars cent wages cents per sold sell buy... |
| 13 | 18311.256224 | 36.886177 | 8.63 | returns sits begins puts cries finds takes tel... | comes looks goes takes returns makes knows cri... | 13: returns sits begins puts cries finds takes... |
| 23 | 15332.440038 | 34.452292 | 8.52 | jury prisoner murder trial evidence judge witn... | court judge prisoner murder jury trial evidenc... | 23: jury prisoner murder trial evidence judge ... |
| 9 | 13835.357462 | 22.391066 | 8.68 | maam smart boots gloves aged stranger faced ba... | maam stranger boots em inquired pocket clerk o... | 9: maam smart boots gloves aged stranger faced... |
| 11 | 12258.544953 | 21.998367 | 8.14 | om ym pilot deck ship voyage ships passengers ... | ship sea om ym deck board pilot passengers ships | 11: om ym pilot deck ship voyage ships passeng... |
| 30 | 10004.087665 | 21.173252 | 8.08 | boat boats tide island lion shore ashore steam... | boat boats island shore bank tide lion stream ... | 30: boat boats tide island lion shore ashore s... |
| 1 | 8929.895643 | 20.770954 | 8.58 | locksmith school baby dance girls dancing pare... | school locksmith baby pocket girls laugh daugh... | 1: locksmith school baby dance girls dancing p... |
| 26 | 8317.639096 | 19.643036 | 8.38 | railway train hotel station platform fourth dr... | train hotel station railway box line road dog ... | 26: railway train hotel station platform fourt... |
| 37 | 12154.044627 | 17.790633 | 9.47 | thy parents author thou thee madam graceful st... | thy thee thou soul ye noble youth parents bear | 37: thy parents author thou thee madam gracefu... |
| 3 | 6787.624918 | 17.416021 | 7.43 | thee thou ye thy lad punch knights rags mad | ye thee thou thy lad none art mad ah | 3: thee thou ye thy lad punch knights rags mad |
| 34 | 6427.723600 | 16.568204 | 7.98 | widow blind song baby cup husband farm sick dance | husband blind widow baby song eat sick couple ... | 34: widow blind song baby cup husband farm sic... |
| 19 | 9985.989702 | 15.111265 | 8.68 | wery wot afore fur job em fat pipe theyre | wery em wot afore job inquired youre fur hes | 19: wery wot afore fur job em fat pipe theyre |
| 21 | 5331.354009 | 14.043632 | 6.49 | en dat de nigger bet sand warnt sell camp | de en dat nigger em knows warnt sell sand | 21: en dat de nigger bet sand warnt sell camp |
| 12 | 4499.035576 | 13.524702 | 8.06 | schoolmaster game punishment play fellows mora... | schoolmaster game play school bad society fell... | 12: schoolmaster game punishment play fellows ... |
| 16 | 5192.973099 | 12.313669 | 8.45 | lawyer chambers legal waters bore rose opinion... | rose lawyer chambers bore honour waters legal ... | 16: lawyer chambers legal waters bore rose opi... |
| 6 | 3704.572278 | 11.467062 | 7.34 | papa german mama police language sentence acco... | papa german mama language police french talkin... | 6: papa german mama police language sentence a... |
| 2 | 5962.053206 | 11.438465 | 7.55 | uncle mail coach nephew driver passengers guar... | uncle coach mail driver nephew guard pause pas... | 2: uncle mail coach nephew driver passengers g... |
| 10 | 3468.842872 | 8.112984 | 6.46 | bill brown bills measure single chief womans d... | bill brown single bills measure chief speech d... | 10: bill brown bills measure single chief woma... |
| 39 | 2456.571850 | 6.332991 | 7.50 | committee honourable member noble british bill... | honourable committee noble member british bank... | 39: committee honourable member noble british ... |
| 17 | 1823.899489 | 4.987247 | 8.42 | traveller pound meal eat landlord gate paid ru... | traveller pound eat gate paid meal landlord of... | 17: traveller pound meal eat landlord gate pai... |
# topic_id whose theta_sum is the largest in the TOPIC table
top_topic = tm.TOPIC['theta_sum'].idxmax()
top_topic
29
# top_terms_rel string of the top topic; .loc looks the row up by its topic_id
# label, so sorting the table first has no effect on the result
tm.TOPIC.loc[top_topic, 'top_terms_rel']
'guardian cousin assure sister confidence pursued dearest madam agreeable'
# take the topic with the highest theta_sum (highest total prob across all docs)
# and keep the first 5 whitespace-separated terms of its top_terms_rel string;
# the lookup is by topic_id label, so no sort of the table is needed first
top_five_terms = tm.TOPIC.loc[top_topic, 'top_terms_rel'].split()[:5]
top_five_terms
['guardian', 'cousin', 'assure', 'sister', 'confidence']
# join THETA and LIB tables
joint_theta = tm.THETA.join(LIB)
# add title column to index
joint_theta = joint_theta.set_index('title', append = True)
# get mean topic distribution for each book. Only the topic columns are averaged:
# the joined frame still carries string columns from LIB (e.g. source_file_path,
# chap_regex, author), and calling .mean() on those raises a TypeError on
# pandas >= 2.0 instead of silently skipping them.
topic_cols = list(tm.THETA.columns)
book_mean_theta = joint_theta.groupby(['book_id', 'title', 'type'])[topic_cols].mean()
book_mean_theta.style.background_gradient(axis=None)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | title | type | ||||||||||||||||||||||||||||||||||||||||
| 70 | what is man | non-fiction | 0.009390 | 0.008849 | 0.000235 | 0.001757 | 0.007265 | 0.145150 | 0.010515 | 0.021549 | 0.293006 | 0.000235 | 0.003116 | 0.040432 | 0.024149 | 0.024241 | 0.014891 | 0.022960 | 0.001842 | 0.000235 | 0.006764 | 0.000235 | 0.002637 | 0.000279 | 0.003191 | 0.019814 | 0.117535 | 0.015059 | 0.003171 | 0.020182 | 0.043686 | 0.000235 | 0.000579 | 0.004617 | 0.015202 | 0.000235 | 0.002092 | 0.071995 | 0.019322 | 0.000235 | 0.022882 | 0.000235 |
| 74 | the adventures of tom sawyer | novel | 0.049413 | 0.025284 | 0.000110 | 0.001742 | 0.079709 | 0.005079 | 0.000252 | 0.028712 | 0.120452 | 0.000110 | 0.000307 | 0.002137 | 0.002263 | 0.002577 | 0.001623 | 0.017570 | 0.002579 | 0.000110 | 0.268991 | 0.003971 | 0.046369 | 0.000519 | 0.013640 | 0.026613 | 0.014533 | 0.001391 | 0.000475 | 0.079104 | 0.103775 | 0.000110 | 0.008729 | 0.000110 | 0.004086 | 0.009188 | 0.005469 | 0.015288 | 0.033902 | 0.015047 | 0.008400 | 0.000261 |
| 76 | the adventures of huckleberry finn | novel | 0.003389 | 0.008033 | 0.002257 | 0.002569 | 0.000090 | 0.000090 | 0.000090 | 0.009304 | 0.007345 | 0.001536 | 0.003213 | 0.002729 | 0.000090 | 0.003128 | 0.000090 | 0.005772 | 0.001120 | 0.000334 | 0.784934 | 0.000090 | 0.000530 | 0.075553 | 0.000090 | 0.008132 | 0.005066 | 0.000685 | 0.000090 | 0.004924 | 0.008482 | 0.001229 | 0.023976 | 0.004994 | 0.003645 | 0.000090 | 0.006614 | 0.010948 | 0.001513 | 0.000090 | 0.007057 | 0.000090 |
| 86 | a connecticut yankee in king arthurs court | novel | 0.070338 | 0.000363 | 0.000599 | 0.049563 | 0.035567 | 0.046094 | 0.000491 | 0.072846 | 0.379629 | 0.000097 | 0.000949 | 0.003843 | 0.000097 | 0.004621 | 0.037833 | 0.012514 | 0.000097 | 0.000097 | 0.043406 | 0.002909 | 0.007081 | 0.001874 | 0.002402 | 0.013563 | 0.014488 | 0.010667 | 0.004123 | 0.003587 | 0.068377 | 0.002801 | 0.000426 | 0.011733 | 0.003261 | 0.001030 | 0.008550 | 0.039248 | 0.008595 | 0.004099 | 0.031875 | 0.000268 |
| 91 | tom sawyer abroad | novel | 0.005992 | 0.000081 | 0.010122 | 0.000081 | 0.001582 | 0.008627 | 0.000081 | 0.008928 | 0.019350 | 0.000081 | 0.000946 | 0.013262 | 0.000081 | 0.000081 | 0.003224 | 0.006106 | 0.000081 | 0.000081 | 0.628797 | 0.000081 | 0.000081 | 0.118461 | 0.000081 | 0.000081 | 0.002753 | 0.000081 | 0.002504 | 0.000081 | 0.104533 | 0.000081 | 0.017823 | 0.019423 | 0.000081 | 0.000081 | 0.000081 | 0.016256 | 0.001804 | 0.000081 | 0.007892 | 0.000081 |
| 93 | tom sawyer detective | novel | 0.018958 | 0.000113 | 0.017373 | 0.000113 | 0.000113 | 0.000113 | 0.000113 | 0.001622 | 0.009057 | 0.003287 | 0.004375 | 0.004046 | 0.000113 | 0.003741 | 0.000113 | 0.006578 | 0.000507 | 0.000113 | 0.840064 | 0.000113 | 0.000113 | 0.000113 | 0.000113 | 0.021745 | 0.000113 | 0.003614 | 0.001548 | 0.020826 | 0.000113 | 0.000113 | 0.007530 | 0.030981 | 0.000113 | 0.000113 | 0.000113 | 0.000113 | 0.000113 | 0.000113 | 0.001653 | 0.000113 |
| 98 | a tale of two cities | novel | 0.011382 | 0.000795 | 0.021161 | 0.003003 | 0.257681 | 0.020277 | 0.000128 | 0.060585 | 0.002053 | 0.000903 | 0.000452 | 0.002264 | 0.001539 | 0.005132 | 0.032801 | 0.013588 | 0.007773 | 0.000775 | 0.000070 | 0.015169 | 0.007934 | 0.000450 | 0.004396 | 0.043527 | 0.006366 | 0.040737 | 0.002021 | 0.055540 | 0.001443 | 0.160841 | 0.005684 | 0.056553 | 0.029251 | 0.024895 | 0.018652 | 0.054822 | 0.024706 | 0.001751 | 0.000604 | 0.002298 |
| 102 | the tragedy of puddnhead wilson | novel | 0.048508 | 0.008584 | 0.008988 | 0.000097 | 0.029773 | 0.030765 | 0.000097 | 0.021389 | 0.264461 | 0.000097 | 0.005334 | 0.000097 | 0.003814 | 0.000790 | 0.004106 | 0.000097 | 0.011661 | 0.000301 | 0.059454 | 0.000097 | 0.000097 | 0.184001 | 0.010912 | 0.106063 | 0.022258 | 0.009673 | 0.004174 | 0.009261 | 0.010417 | 0.028708 | 0.003594 | 0.000097 | 0.020802 | 0.016729 | 0.005419 | 0.020885 | 0.011506 | 0.015465 | 0.020639 | 0.000792 |
| 119 | a tramp abroad | non-fiction | 0.012653 | 0.003515 | 0.000066 | 0.004471 | 0.030154 | 0.029588 | 0.022820 | 0.030376 | 0.178021 | 0.004038 | 0.000481 | 0.003026 | 0.001016 | 0.014332 | 0.015033 | 0.008353 | 0.000202 | 0.000066 | 0.027959 | 0.001459 | 0.012460 | 0.001397 | 0.004935 | 0.005907 | 0.047871 | 0.012815 | 0.027897 | 0.006466 | 0.269704 | 0.013885 | 0.006268 | 0.022028 | 0.043631 | 0.002827 | 0.004838 | 0.087093 | 0.007448 | 0.008482 | 0.026354 | 0.000066 |
| 142 | the 30000 bequest and other stories | stories | 0.047183 | 0.000579 | 0.022021 | 0.001838 | 0.001521 | 0.087963 | 0.004357 | 0.004465 | 0.289555 | 0.003611 | 0.001774 | 0.010732 | 0.003705 | 0.027697 | 0.013455 | 0.001038 | 0.002375 | 0.000128 | 0.022441 | 0.000819 | 0.001612 | 0.000565 | 0.004751 | 0.000844 | 0.142991 | 0.004663 | 0.031612 | 0.044909 | 0.027072 | 0.007030 | 0.002531 | 0.001301 | 0.002736 | 0.003749 | 0.008140 | 0.013848 | 0.018329 | 0.082098 | 0.053831 | 0.000128 |
| 245 | life on the mississippi | non-fiction | 0.006648 | 0.003169 | 0.004721 | 0.000427 | 0.026438 | 0.074778 | 0.001328 | 0.019804 | 0.154711 | 0.002396 | 0.005967 | 0.061136 | 0.002884 | 0.006595 | 0.025567 | 0.003980 | 0.001731 | 0.000111 | 0.061439 | 0.000737 | 0.004596 | 0.003572 | 0.004500 | 0.011441 | 0.043710 | 0.013959 | 0.003447 | 0.001862 | 0.206302 | 0.004076 | 0.071164 | 0.006360 | 0.008732 | 0.007430 | 0.002663 | 0.049682 | 0.016276 | 0.006675 | 0.068826 | 0.000161 |
| 564 | the mystery of edwin drood | novel | 0.003491 | 0.010043 | 0.003267 | 0.006959 | 0.147611 | 0.031131 | 0.000230 | 0.003233 | 0.008846 | 0.004293 | 0.000358 | 0.001743 | 0.001017 | 0.085361 | 0.000937 | 0.032767 | 0.025782 | 0.001344 | 0.007663 | 0.002596 | 0.003412 | 0.000266 | 0.017439 | 0.007498 | 0.010510 | 0.043414 | 0.006177 | 0.008138 | 0.001611 | 0.303523 | 0.006738 | 0.022146 | 0.032933 | 0.039216 | 0.003078 | 0.043777 | 0.068579 | 0.001219 | 0.001037 | 0.000619 |
| 580 | the pickwick papers | novel | 0.003281 | 0.009057 | 0.019654 | 0.000318 | 0.020148 | 0.024977 | 0.000328 | 0.013093 | 0.000631 | 0.072185 | 0.001253 | 0.000258 | 0.000865 | 0.002757 | 0.002643 | 0.017444 | 0.003106 | 0.000120 | 0.000251 | 0.103974 | 0.045815 | 0.000149 | 0.361552 | 0.012102 | 0.008169 | 0.018324 | 0.001288 | 0.010501 | 0.005732 | 0.029421 | 0.001582 | 0.040540 | 0.107451 | 0.021584 | 0.002277 | 0.011547 | 0.020182 | 0.000057 | 0.000878 | 0.004506 |
| 588 | master humphreys clock | stories | 0.000044 | 0.000044 | 0.001081 | 0.004862 | 0.016351 | 0.099404 | 0.000254 | 0.030531 | 0.000044 | 0.000044 | 0.001658 | 0.000044 | 0.000044 | 0.014576 | 0.004410 | 0.022059 | 0.001661 | 0.000044 | 0.000044 | 0.080110 | 0.154821 | 0.000728 | 0.062788 | 0.001983 | 0.061427 | 0.004070 | 0.000385 | 0.000884 | 0.001146 | 0.110842 | 0.000530 | 0.028977 | 0.125282 | 0.030545 | 0.000044 | 0.018969 | 0.113731 | 0.001935 | 0.000044 | 0.003563 |
| 644 | the haunted man and the ghosts bargain | stories | 0.013539 | 0.008731 | 0.000018 | 0.000755 | 0.338508 | 0.036795 | 0.000018 | 0.000018 | 0.000018 | 0.000018 | 0.000018 | 0.000018 | 0.000018 | 0.000018 | 0.000018 | 0.048639 | 0.000018 | 0.000018 | 0.008880 | 0.008994 | 0.000018 | 0.000018 | 0.039348 | 0.000018 | 0.000018 | 0.011931 | 0.001325 | 0.119953 | 0.000018 | 0.054100 | 0.002945 | 0.002512 | 0.002999 | 0.023194 | 0.024948 | 0.045053 | 0.199515 | 0.004034 | 0.002985 | 0.000018 |
| 650 | pictures from italy | non-fiction | 0.000049 | 0.000049 | 0.000049 | 0.000049 | 0.025876 | 0.048508 | 0.000049 | 0.018233 | 0.005075 | 0.000049 | 0.001030 | 0.000049 | 0.000049 | 0.015576 | 0.001485 | 0.006035 | 0.002129 | 0.000049 | 0.000049 | 0.000049 | 0.013290 | 0.005375 | 0.000049 | 0.003207 | 0.021150 | 0.062603 | 0.001672 | 0.000049 | 0.018010 | 0.025269 | 0.009037 | 0.211731 | 0.086059 | 0.005205 | 0.003540 | 0.404744 | 0.004048 | 0.000049 | 0.000049 | 0.000378 |
| 653 | the chimes | novel | 0.010035 | 0.012778 | 0.004614 | 0.003634 | 0.261995 | 0.035002 | 0.000023 | 0.000023 | 0.018819 | 0.000023 | 0.000770 | 0.000023 | 0.000023 | 0.007930 | 0.000023 | 0.057197 | 0.000023 | 0.001126 | 0.007654 | 0.035490 | 0.001012 | 0.000023 | 0.008587 | 0.001902 | 0.000023 | 0.009027 | 0.000023 | 0.024785 | 0.000023 | 0.065215 | 0.000023 | 0.032776 | 0.018716 | 0.137051 | 0.014520 | 0.014662 | 0.214359 | 0.000023 | 0.000023 | 0.000023 |
| 675 | american notes | non-fiction | 0.000032 | 0.004439 | 0.007907 | 0.000644 | 0.009644 | 0.144144 | 0.000395 | 0.016697 | 0.000032 | 0.003429 | 0.001726 | 0.052185 | 0.000183 | 0.011323 | 0.010785 | 0.001130 | 0.000032 | 0.000666 | 0.001681 | 0.000032 | 0.063589 | 0.000032 | 0.007802 | 0.028082 | 0.003566 | 0.099289 | 0.011625 | 0.000032 | 0.066451 | 0.034541 | 0.032365 | 0.133414 | 0.109447 | 0.015000 | 0.001785 | 0.083235 | 0.037067 | 0.000032 | 0.004933 | 0.000611 |
| 676 | the battle of life | novel | 0.032070 | 0.048985 | 0.002432 | 0.001528 | 0.027884 | 0.009867 | 0.000017 | 0.003545 | 0.000017 | 0.015602 | 0.004412 | 0.000017 | 0.000017 | 0.006410 | 0.038161 | 0.000947 | 0.006143 | 0.000017 | 0.000017 | 0.000017 | 0.000017 | 0.000017 | 0.006613 | 0.000017 | 0.003205 | 0.000017 | 0.000017 | 0.061075 | 0.000017 | 0.212033 | 0.001181 | 0.037772 | 0.009631 | 0.167399 | 0.009771 | 0.030190 | 0.249298 | 0.013594 | 0.000017 | 0.000017 |
| 699 | a childs history of england | non-fiction | 0.007595 | 0.003193 | 0.004027 | 0.003908 | 0.015744 | 0.010165 | 0.000279 | 0.086796 | 0.000063 | 0.000063 | 0.000850 | 0.006467 | 0.000063 | 0.002337 | 0.702304 | 0.002336 | 0.002245 | 0.000304 | 0.000063 | 0.000063 | 0.000517 | 0.000828 | 0.001765 | 0.009853 | 0.002335 | 0.001818 | 0.002271 | 0.005467 | 0.006591 | 0.023887 | 0.008268 | 0.013287 | 0.012881 | 0.000063 | 0.001967 | 0.033494 | 0.018338 | 0.005189 | 0.000847 | 0.001472 |
| 700 | the old curiosity shop | novel | 0.003832 | 0.008643 | 0.002243 | 0.004485 | 0.047036 | 0.011589 | 0.000059 | 0.001713 | 0.003850 | 0.009966 | 0.004901 | 0.001709 | 0.011828 | 0.005782 | 0.001818 | 0.025213 | 0.002009 | 0.000908 | 0.001618 | 0.001390 | 0.166047 | 0.000105 | 0.125930 | 0.008335 | 0.003628 | 0.010708 | 0.000160 | 0.020598 | 0.001445 | 0.109688 | 0.002808 | 0.067406 | 0.032958 | 0.154801 | 0.004360 | 0.014307 | 0.120166 | 0.003426 | 0.001906 | 0.000626 |
| 730 | oliver twist | novel | 0.004533 | 0.001820 | 0.000843 | 0.000063 | 0.110861 | 0.015866 | 0.000063 | 0.026520 | 0.002436 | 0.019079 | 0.015208 | 0.001572 | 0.002692 | 0.002161 | 0.006233 | 0.034464 | 0.020060 | 0.001144 | 0.009060 | 0.015587 | 0.086300 | 0.000275 | 0.317198 | 0.013840 | 0.005381 | 0.028280 | 0.000731 | 0.019728 | 0.002895 | 0.013626 | 0.001919 | 0.040055 | 0.013314 | 0.078532 | 0.006070 | 0.001133 | 0.074168 | 0.004743 | 0.001365 | 0.000181 |
| 766 | david copperfield | novel | 0.001296 | 0.016388 | 0.004283 | 0.000894 | 0.063482 | 0.026414 | 0.003275 | 0.006349 | 0.001869 | 0.014424 | 0.000682 | 0.004389 | 0.001149 | 0.018542 | 0.001611 | 0.076724 | 0.002864 | 0.000211 | 0.005459 | 0.013796 | 0.005361 | 0.000139 | 0.043218 | 0.003403 | 0.025111 | 0.025005 | 0.001259 | 0.194949 | 0.007341 | 0.305075 | 0.006192 | 0.026949 | 0.037878 | 0.015295 | 0.000650 | 0.011213 | 0.023023 | 0.000448 | 0.003227 | 0.000162 |
| 786 | hard times | novel | 0.017459 | 0.000072 | 0.000950 | 0.018537 | 0.174327 | 0.074307 | 0.000072 | 0.017165 | 0.001599 | 0.024379 | 0.000711 | 0.000072 | 0.002525 | 0.006388 | 0.005525 | 0.015128 | 0.002426 | 0.000072 | 0.014626 | 0.005060 | 0.008585 | 0.000072 | 0.019413 | 0.002483 | 0.008350 | 0.005876 | 0.017098 | 0.058787 | 0.003623 | 0.349753 | 0.005389 | 0.031536 | 0.004996 | 0.045274 | 0.001123 | 0.005689 | 0.046728 | 0.000072 | 0.000072 | 0.003681 |
| 807 | hunted down | stories | 0.000160 | 0.000160 | 0.006459 | 0.005958 | 0.143385 | 0.155136 | 0.000160 | 0.000160 | 0.031470 | 0.050362 | 0.000160 | 0.000160 | 0.000160 | 0.009417 | 0.005514 | 0.008811 | 0.006601 | 0.000160 | 0.000160 | 0.000160 | 0.000160 | 0.000160 | 0.000160 | 0.004341 | 0.056479 | 0.000160 | 0.000160 | 0.000160 | 0.031224 | 0.338052 | 0.000160 | 0.020561 | 0.048957 | 0.017259 | 0.000160 | 0.000160 | 0.056495 | 0.000160 | 0.000160 | 0.000160 |
| 809 | holiday romance | stories | 0.000057 | 0.305651 | 0.001860 | 0.001941 | 0.000057 | 0.000057 | 0.033960 | 0.093647 | 0.000057 | 0.091103 | 0.000449 | 0.029579 | 0.019487 | 0.009635 | 0.025126 | 0.048472 | 0.000057 | 0.000057 | 0.016341 | 0.000057 | 0.000057 | 0.000057 | 0.009814 | 0.036647 | 0.014616 | 0.000057 | 0.000057 | 0.006521 | 0.000057 | 0.128031 | 0.033137 | 0.000057 | 0.046351 | 0.000057 | 0.022668 | 0.000057 | 0.000057 | 0.022345 | 0.000057 | 0.001656 |
| 810 | george silvermans explanation | stories | 0.009921 | 0.047064 | 0.001910 | 0.006749 | 0.071651 | 0.143381 | 0.000476 | 0.000476 | 0.034864 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.005450 | 0.049834 | 0.016374 | 0.000476 | 0.007152 | 0.000476 | 0.019153 | 0.000476 | 0.052804 | 0.010907 | 0.037490 | 0.064397 | 0.004533 | 0.053375 | 0.000476 | 0.185868 | 0.000476 | 0.025508 | 0.000476 | 0.000476 | 0.040390 | 0.020361 | 0.069795 | 0.008401 | 0.003932 | 0.002072 |
| 821 | dombey and sons | novel | 0.006023 | 0.000209 | 0.012255 | 0.000376 | 0.100640 | 0.013093 | 0.007131 | 0.009672 | 0.000035 | 0.010699 | 0.007681 | 0.005454 | 0.000860 | 0.027464 | 0.002128 | 0.081267 | 0.002218 | 0.000110 | 0.002291 | 0.003199 | 0.016423 | 0.000664 | 0.066473 | 0.001332 | 0.003342 | 0.005226 | 0.001725 | 0.097441 | 0.002636 | 0.344962 | 0.003170 | 0.019996 | 0.033533 | 0.030413 | 0.003435 | 0.019597 | 0.052359 | 0.002758 | 0.001123 | 0.000585 |
| 824 | speeches of charles dickens | non-fiction | 0.002244 | 0.007369 | 0.000677 | 0.000307 | 0.002318 | 0.553153 | 0.000606 | 0.002830 | 0.001534 | 0.000219 | 0.000574 | 0.010276 | 0.000219 | 0.005650 | 0.004476 | 0.003755 | 0.037717 | 0.001077 | 0.000484 | 0.000219 | 0.005985 | 0.000859 | 0.000219 | 0.005946 | 0.052224 | 0.008766 | 0.011176 | 0.002050 | 0.002387 | 0.059758 | 0.003242 | 0.004533 | 0.077365 | 0.000219 | 0.008516 | 0.037413 | 0.057518 | 0.013683 | 0.003555 | 0.008880 |
| 872 | reprinted pieces | stories | 0.002692 | 0.022891 | 0.000107 | 0.002791 | 0.021129 | 0.067194 | 0.002700 | 0.012149 | 0.000051 | 0.022769 | 0.012911 | 0.023300 | 0.000536 | 0.034609 | 0.047992 | 0.051746 | 0.049217 | 0.019090 | 0.003789 | 0.001271 | 0.016779 | 0.000740 | 0.001711 | 0.003315 | 0.032565 | 0.114024 | 0.016753 | 0.024148 | 0.019770 | 0.019749 | 0.026277 | 0.070134 | 0.091912 | 0.015095 | 0.010529 | 0.052190 | 0.021984 | 0.001946 | 0.007758 | 0.053686 |
| 882 | sketches by boz | stories | 0.000087 | 0.006586 | 0.013932 | 0.000180 | 0.016988 | 0.035101 | 0.000408 | 0.010867 | 0.004694 | 0.035424 | 0.004252 | 0.002375 | 0.000918 | 0.018194 | 0.002876 | 0.018035 | 0.003599 | 0.000387 | 0.003867 | 0.002217 | 0.076421 | 0.000409 | 0.097617 | 0.013800 | 0.010062 | 0.093997 | 0.000716 | 0.008572 | 0.001402 | 0.028628 | 0.007423 | 0.041160 | 0.389772 | 0.006807 | 0.002030 | 0.014159 | 0.013995 | 0.000551 | 0.005108 | 0.006384 |
| 883 | our mutual friend | novel | 0.004088 | 0.006129 | 0.000191 | 0.001634 | 0.138487 | 0.029852 | 0.000490 | 0.002597 | 0.007627 | 0.013047 | 0.002076 | 0.001798 | 0.009780 | 0.037427 | 0.004512 | 0.072142 | 0.019643 | 0.000571 | 0.006862 | 0.006491 | 0.002056 | 0.000376 | 0.046276 | 0.005640 | 0.009054 | 0.020626 | 0.001963 | 0.019689 | 0.002027 | 0.315395 | 0.025591 | 0.010310 | 0.023754 | 0.105206 | 0.010223 | 0.004507 | 0.015783 | 0.004442 | 0.008021 | 0.003619 |
| 888 | the lazy tour of two idle apprentices | stories | 0.000021 | 0.000021 | 0.000359 | 0.000021 | 0.137140 | 0.051788 | 0.000021 | 0.006801 | 0.023275 | 0.007542 | 0.001968 | 0.001973 | 0.000021 | 0.006908 | 0.009051 | 0.012922 | 0.005666 | 0.000021 | 0.000021 | 0.000021 | 0.043741 | 0.000021 | 0.035380 | 0.000021 | 0.020445 | 0.224965 | 0.021404 | 0.000021 | 0.070535 | 0.067169 | 0.000021 | 0.149138 | 0.048367 | 0.000021 | 0.000021 | 0.013515 | 0.032597 | 0.000021 | 0.005130 | 0.001910 |
| 912 | the mudfog and other sketches | stories | 0.003319 | 0.000290 | 0.037002 | 0.000062 | 0.000062 | 0.184076 | 0.000062 | 0.007906 | 0.005512 | 0.011126 | 0.000062 | 0.005354 | 0.000062 | 0.007623 | 0.003144 | 0.006610 | 0.000062 | 0.000062 | 0.005729 | 0.007128 | 0.055961 | 0.000472 | 0.064658 | 0.006094 | 0.008670 | 0.013262 | 0.020875 | 0.000062 | 0.006645 | 0.042761 | 0.032201 | 0.034813 | 0.381971 | 0.008938 | 0.000062 | 0.002080 | 0.021059 | 0.000062 | 0.003471 | 0.010628 |
| 914 | the uncommerical traveller | non-fiction | 0.002452 | 0.004144 | 0.004081 | 0.000912 | 0.044953 | 0.119505 | 0.003267 | 0.024083 | 0.011438 | 0.002019 | 0.003721 | 0.029011 | 0.002736 | 0.023526 | 0.014059 | 0.076547 | 0.012000 | 0.005797 | 0.002078 | 0.004513 | 0.003563 | 0.001697 | 0.013785 | 0.006227 | 0.008516 | 0.178545 | 0.033046 | 0.006223 | 0.009264 | 0.064368 | 0.014244 | 0.075122 | 0.108261 | 0.005675 | 0.012378 | 0.049931 | 0.007741 | 0.005130 | 0.003049 | 0.002393 |
| 916 | sketches of young couples | stories | 0.006602 | 0.075274 | 0.000124 | 0.000124 | 0.000124 | 0.080526 | 0.005364 | 0.000124 | 0.000124 | 0.013803 | 0.000124 | 0.000124 | 0.000124 | 0.138807 | 0.000124 | 0.010918 | 0.003723 | 0.000124 | 0.000124 | 0.003770 | 0.066943 | 0.000124 | 0.048686 | 0.004700 | 0.000124 | 0.000124 | 0.000124 | 0.033477 | 0.000124 | 0.140611 | 0.007810 | 0.016093 | 0.187859 | 0.023017 | 0.050191 | 0.009206 | 0.066474 | 0.000124 | 0.003911 | 0.000124 |
| 917 | barnaby rudge | stories | 0.001484 | 0.039080 | 0.002127 | 0.001687 | 0.124637 | 0.023546 | 0.000060 | 0.118934 | 0.002873 | 0.004162 | 0.000729 | 0.000188 | 0.000444 | 0.002984 | 0.007480 | 0.008124 | 0.001434 | 0.001082 | 0.001819 | 0.007535 | 0.071889 | 0.000090 | 0.088993 | 0.008522 | 0.000217 | 0.017280 | 0.001099 | 0.005461 | 0.001021 | 0.114286 | 0.002291 | 0.036449 | 0.016389 | 0.135217 | 0.012567 | 0.014178 | 0.114350 | 0.005469 | 0.002248 | 0.001574 |
| 918 | sketches of young gentlemen | stories | 0.004089 | 0.005804 | 0.000138 | 0.000138 | 0.013997 | 0.112845 | 0.001068 | 0.022076 | 0.000138 | 0.002182 | 0.000138 | 0.000957 | 0.000138 | 0.079010 | 0.000138 | 0.030502 | 0.000138 | 0.000138 | 0.000138 | 0.001091 | 0.018252 | 0.000138 | 0.099702 | 0.000138 | 0.002364 | 0.000138 | 0.000138 | 0.000138 | 0.000138 | 0.195908 | 0.004616 | 0.000138 | 0.347162 | 0.013838 | 0.000138 | 0.006504 | 0.012918 | 0.021564 | 0.000138 | 0.001059 |
| 922 | sunday under three heads | non-fiction | 0.000046 | 0.000046 | 0.000046 | 0.000046 | 0.000046 | 0.272929 | 0.000046 | 0.016210 | 0.000046 | 0.000046 | 0.021101 | 0.010027 | 0.008715 | 0.023691 | 0.024064 | 0.000046 | 0.000046 | 0.000046 | 0.000046 | 0.000046 | 0.030070 | 0.000046 | 0.000046 | 0.007468 | 0.000046 | 0.140669 | 0.000046 | 0.000046 | 0.000046 | 0.039152 | 0.002097 | 0.108734 | 0.182674 | 0.000046 | 0.007984 | 0.043120 | 0.048683 | 0.000046 | 0.004251 | 0.007398 |
| 927 | the lamplighter | stories | 0.000026 | 0.000026 | 0.009641 | 0.009697 | 0.025532 | 0.000026 | 0.002298 | 0.000026 | 0.000026 | 0.000026 | 0.002256 | 0.000026 | 0.000026 | 0.193202 | 0.048433 | 0.000026 | 0.000026 | 0.000026 | 0.098652 | 0.000026 | 0.048376 | 0.000026 | 0.061973 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.052617 | 0.000026 | 0.000026 | 0.106671 | 0.249931 | 0.000026 | 0.000026 | 0.000026 | 0.090052 | 0.000026 | 0.000026 |
| 967 | nicholas nickleby | novel | 0.002125 | 0.011280 | 0.010865 | 0.006124 | 0.048602 | 0.023867 | 0.005086 | 0.006421 | 0.004549 | 0.009090 | 0.001564 | 0.000357 | 0.005359 | 0.005791 | 0.008039 | 0.012111 | 0.000506 | 0.000491 | 0.003394 | 0.001049 | 0.043806 | 0.000212 | 0.345691 | 0.003005 | 0.010450 | 0.014124 | 0.000500 | 0.009573 | 0.000866 | 0.169686 | 0.000289 | 0.016760 | 0.063542 | 0.052850 | 0.003019 | 0.007265 | 0.083355 | 0.001900 | 0.005011 | 0.001425 |
| 968 | martin chuzzlewit | novel | 0.003355 | 0.001296 | 0.003595 | 0.004087 | 0.045424 | 0.053055 | 0.000799 | 0.003193 | 0.003010 | 0.016225 | 0.001257 | 0.007943 | 0.001183 | 0.004642 | 0.002101 | 0.031619 | 0.001580 | 0.000499 | 0.003474 | 0.011793 | 0.036561 | 0.000115 | 0.089186 | 0.004860 | 0.007091 | 0.004773 | 0.002650 | 0.008290 | 0.004483 | 0.289093 | 0.006735 | 0.037685 | 0.041666 | 0.119489 | 0.001854 | 0.011782 | 0.106202 | 0.002061 | 0.004976 | 0.020319 |
| 1023 | bleak house | novel | 0.004758 | 0.005565 | 0.001183 | 0.001195 | 0.063027 | 0.034179 | 0.001225 | 0.005106 | 0.002430 | 0.005056 | 0.000855 | 0.001782 | 0.000285 | 0.124690 | 0.005372 | 0.034434 | 0.010449 | 0.000336 | 0.003598 | 0.008935 | 0.012820 | 0.000812 | 0.005495 | 0.020472 | 0.008108 | 0.037943 | 0.000583 | 0.043751 | 0.002087 | 0.369145 | 0.000917 | 0.062103 | 0.026720 | 0.057080 | 0.001845 | 0.020585 | 0.010186 | 0.001675 | 0.002339 | 0.000876 |
| 1044 | extract from captain stormfields visit to Heaven | stories | 0.000026 | 0.000026 | 0.000026 | 0.003027 | 0.000026 | 0.038712 | 0.000026 | 0.010110 | 0.273283 | 0.010064 | 0.000026 | 0.004330 | 0.000026 | 0.021369 | 0.000026 | 0.036398 | 0.000026 | 0.000026 | 0.380510 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.005046 | 0.020815 | 0.120613 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.000026 | 0.058521 | 0.000026 | 0.012742 | 0.000026 | 0.003815 |
| 1086 | a horses tale | novel | 0.059202 | 0.006429 | 0.003931 | 0.000380 | 0.000380 | 0.014288 | 0.000380 | 0.124267 | 0.287565 | 0.001775 | 0.016005 | 0.000380 | 0.000380 | 0.059956 | 0.006656 | 0.000380 | 0.000380 | 0.000380 | 0.165845 | 0.003990 | 0.000380 | 0.000380 | 0.000380 | 0.015203 | 0.004769 | 0.000380 | 0.000380 | 0.056436 | 0.080230 | 0.008057 | 0.000380 | 0.000380 | 0.000380 | 0.002029 | 0.007926 | 0.010480 | 0.041512 | 0.004405 | 0.012586 | 0.000380 |
| 1289 | three ghost stories | stories | 0.000028 | 0.011209 | 0.000028 | 0.000028 | 0.356886 | 0.064719 | 0.000028 | 0.000028 | 0.002145 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.020180 | 0.000028 | 0.002139 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.007600 | 0.155852 | 0.000028 | 0.000028 | 0.061068 | 0.000028 | 0.000028 | 0.106354 | 0.000028 | 0.063453 | 0.114172 | 0.000028 | 0.000028 | 0.012380 | 0.009372 | 0.007583 | 0.004211 | 0.000028 |
| 1394 | the holly tree | stories | 0.000070 | 0.008868 | 0.001150 | 0.001172 | 0.005605 | 0.007670 | 0.000367 | 0.017607 | 0.000070 | 0.081669 | 0.013279 | 0.000070 | 0.002803 | 0.048426 | 0.033752 | 0.034844 | 0.003383 | 0.000070 | 0.019988 | 0.003643 | 0.003281 | 0.000070 | 0.000070 | 0.009350 | 0.002607 | 0.024561 | 0.015921 | 0.092577 | 0.016530 | 0.207301 | 0.002018 | 0.309378 | 0.017405 | 0.000070 | 0.000070 | 0.011054 | 0.000070 | 0.000070 | 0.003023 | 0.000070 |
| 1400 | great expectations | novel | 0.008051 | 0.010427 | 0.009295 | 0.000273 | 0.147545 | 0.016101 | 0.000141 | 0.016068 | 0.002444 | 0.034892 | 0.001620 | 0.001655 | 0.004322 | 0.002420 | 0.010844 | 0.135783 | 0.011776 | 0.002337 | 0.011440 | 0.012367 | 0.013224 | 0.000104 | 0.008377 | 0.020282 | 0.011404 | 0.029075 | 0.000074 | 0.023468 | 0.003401 | 0.298123 | 0.018875 | 0.033577 | 0.027675 | 0.042299 | 0.000823 | 0.007518 | 0.016436 | 0.000504 | 0.004597 | 0.000365 |
| 1406 | the perils of certain english prisoners | stories | 0.014194 | 0.000021 | 0.000021 | 0.000021 | 0.027778 | 0.000021 | 0.000021 | 0.203184 | 0.003034 | 0.012660 | 0.000021 | 0.026865 | 0.008659 | 0.008602 | 0.020588 | 0.041268 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.115146 | 0.000021 | 0.146811 | 0.338734 | 0.000021 | 0.000021 | 0.000021 | 0.012749 | 0.019201 | 0.000021 | 0.000021 | 0.000021 | 0.000021 |
| 1407 | a message from the sea | stories | 0.000043 | 0.000043 | 0.000043 | 0.000043 | 0.050603 | 0.000043 | 0.000043 | 0.000043 | 0.000043 | 0.055973 | 0.005727 | 0.023967 | 0.000043 | 0.000043 | 0.000043 | 0.153868 | 0.006144 | 0.000043 | 0.012117 | 0.001977 | 0.000043 | 0.000043 | 0.000043 | 0.000043 | 0.086484 | 0.000043 | 0.000043 | 0.015735 | 0.037522 | 0.170046 | 0.019317 | 0.009532 | 0.000043 | 0.125547 | 0.000043 | 0.089674 | 0.125228 | 0.000043 | 0.004716 | 0.004918 |
| 1413 | tom tiddlers ground | stories | 0.035451 | 0.000092 | 0.000092 | 0.000092 | 0.000092 | 0.014516 | 0.002532 | 0.000092 | 0.000092 | 0.000092 | 0.000092 | 0.000092 | 0.000092 | 0.056255 | 0.000092 | 0.029080 | 0.000092 | 0.401024 | 0.000092 | 0.033424 | 0.000092 | 0.000092 | 0.000092 | 0.000092 | 0.049512 | 0.011152 | 0.000092 | 0.061790 | 0.000092 | 0.116580 | 0.012194 | 0.063932 | 0.042396 | 0.067860 | 0.000092 | 0.000092 | 0.000092 | 0.000092 | 0.000092 | 0.000092 |
| 1414 | somebodys luggage | stories | 0.000039 | 0.000039 | 0.000039 | 0.002410 | 0.047335 | 0.065210 | 0.000039 | 0.040716 | 0.000039 | 0.061912 | 0.009994 | 0.000039 | 0.000039 | 0.000039 | 0.009826 | 0.098158 | 0.000039 | 0.000039 | 0.000039 | 0.020193 | 0.000039 | 0.001258 | 0.036868 | 0.000039 | 0.152739 | 0.094292 | 0.008094 | 0.035274 | 0.000039 | 0.130270 | 0.006624 | 0.033222 | 0.026763 | 0.052395 | 0.000039 | 0.065699 | 0.000039 | 0.000039 | 0.000039 | 0.000039 |
| 1415 | doctor marigold | stories | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.090357 | 0.000243 | 0.000243 | 0.000243 | 0.043937 | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.012153 | 0.000243 | 0.038077 | 0.000243 | 0.167589 | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.190915 | 0.000243 | 0.004866 | 0.045748 | 0.000243 | 0.074768 | 0.000243 | 0.084178 | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.000243 | 0.240352 | 0.000243 |
| 1416 | mrs lirripers lodgings | stories | 0.010001 | 0.125790 | 0.000063 | 0.000063 | 0.014245 | 0.001928 | 0.000063 | 0.000063 | 0.024774 | 0.027208 | 0.009422 | 0.000063 | 0.000063 | 0.038522 | 0.000063 | 0.150047 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.014796 | 0.022164 | 0.003146 | 0.062366 | 0.000063 | 0.279825 | 0.000063 | 0.038550 | 0.086227 | 0.081866 | 0.000063 | 0.000063 | 0.000063 | 0.000063 | 0.007741 | 0.000063 |
| 1421 | mrs lirripers legacy | stories | 0.033660 | 0.000070 | 0.000070 | 0.002316 | 0.015992 | 0.000070 | 0.000070 | 0.042129 | 0.008298 | 0.000070 | 0.000070 | 0.000070 | 0.000070 | 0.026659 | 0.056801 | 0.154682 | 0.000070 | 0.000070 | 0.010421 | 0.000070 | 0.000070 | 0.000070 | 0.000070 | 0.000070 | 0.045520 | 0.034743 | 0.015652 | 0.059030 | 0.000070 | 0.277510 | 0.000070 | 0.000070 | 0.037251 | 0.000070 | 0.000070 | 0.007933 | 0.169782 | 0.000070 | 0.000070 | 0.000070 |
| 1435 | miscellaneous papers | non-fiction | 0.011929 | 0.005109 | 0.000878 | 0.005464 | 0.018538 | 0.437036 | 0.002649 | 0.000110 | 0.000110 | 0.000110 | 0.000110 | 0.005096 | 0.015935 | 0.007046 | 0.033284 | 0.010773 | 0.000384 | 0.000911 | 0.000110 | 0.000110 | 0.020848 | 0.001432 | 0.000110 | 0.053288 | 0.069205 | 0.052118 | 0.001924 | 0.047632 | 0.000110 | 0.024136 | 0.000110 | 0.006275 | 0.093166 | 0.000110 | 0.001681 | 0.040028 | 0.017083 | 0.009560 | 0.002830 | 0.002663 |
| 1467 | some christmas stories | stories | 0.014210 | 0.039305 | 0.009365 | 0.000068 | 0.037513 | 0.032199 | 0.000876 | 0.005088 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | 0.067357 | 0.030319 | 0.031980 | 0.035329 | 0.000068 | 0.013546 | 0.003468 | 0.000068 | 0.010016 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | 0.032800 | 0.000068 | 0.074771 | 0.004318 | 0.166369 | 0.002310 | 0.138201 | 0.000068 | 0.000068 | 0.003571 | 0.108232 | 0.111641 | 0.019727 | 0.006471 | 0.000068 |
| 1837 | the prince and the pauper | novel | 0.125743 | 0.014046 | 0.002184 | 0.225778 | 0.082276 | 0.012667 | 0.000258 | 0.090723 | 0.100744 | 0.000116 | 0.003111 | 0.000926 | 0.001426 | 0.001213 | 0.016594 | 0.008136 | 0.004622 | 0.000960 | 0.003165 | 0.000233 | 0.014543 | 0.001045 | 0.018216 | 0.027053 | 0.008439 | 0.019571 | 0.000902 | 0.005416 | 0.018147 | 0.037613 | 0.001162 | 0.004094 | 0.010064 | 0.004237 | 0.008178 | 0.076497 | 0.023576 | 0.021492 | 0.004034 | 0.000799 |
| 2324 | a house to let | stories | 0.014811 | 0.000038 | 0.002889 | 0.001239 | 0.039083 | 0.025552 | 0.002840 | 0.000038 | 0.035412 | 0.033709 | 0.000038 | 0.000038 | 0.003289 | 0.015144 | 0.017534 | 0.179573 | 0.001026 | 0.000507 | 0.006870 | 0.022148 | 0.069638 | 0.000038 | 0.051005 | 0.000038 | 0.026245 | 0.011389 | 0.001323 | 0.100124 | 0.000038 | 0.204624 | 0.001314 | 0.010229 | 0.012098 | 0.000038 | 0.009929 | 0.000038 | 0.068250 | 0.025332 | 0.006500 | 0.000038 |
| 2874 | personal recollections of joan of arc vol 1 | non-fiction | 0.254703 | 0.003826 | 0.002709 | 0.008486 | 0.019684 | 0.004559 | 0.000108 | 0.163436 | 0.301137 | 0.000108 | 0.000247 | 0.000108 | 0.000967 | 0.006413 | 0.032959 | 0.004157 | 0.000231 | 0.000108 | 0.024249 | 0.000108 | 0.007636 | 0.010241 | 0.004606 | 0.006486 | 0.007070 | 0.002021 | 0.000940 | 0.005925 | 0.005511 | 0.017831 | 0.006245 | 0.009349 | 0.006495 | 0.000989 | 0.003908 | 0.030857 | 0.028394 | 0.016854 | 0.000108 | 0.000230 |
| 2875 | personal recollections of joan of arc vol 2 | non-fiction | 0.297637 | 0.001720 | 0.001515 | 0.004186 | 0.031298 | 0.032773 | 0.000318 | 0.166488 | 0.201224 | 0.000120 | 0.000120 | 0.000120 | 0.002494 | 0.002260 | 0.062038 | 0.000120 | 0.001026 | 0.000392 | 0.004382 | 0.000120 | 0.003747 | 0.002552 | 0.003638 | 0.058809 | 0.035605 | 0.002257 | 0.000120 | 0.001615 | 0.006843 | 0.011981 | 0.001675 | 0.003160 | 0.000120 | 0.000120 | 0.002293 | 0.024776 | 0.015619 | 0.010168 | 0.004431 | 0.000120 |
| 2895 | following the equator | non-fiction | 0.011044 | 0.001414 | 0.000457 | 0.001887 | 0.006131 | 0.089574 | 0.001073 | 0.028531 | 0.294687 | 0.001252 | 0.002843 | 0.029978 | 0.002753 | 0.017941 | 0.034808 | 0.005698 | 0.000164 | 0.000074 | 0.003895 | 0.000977 | 0.002475 | 0.001045 | 0.000074 | 0.010054 | 0.038305 | 0.015447 | 0.023789 | 0.004283 | 0.165327 | 0.003860 | 0.009761 | 0.015251 | 0.005023 | 0.004518 | 0.005769 | 0.108919 | 0.006070 | 0.001164 | 0.042595 | 0.001092 |
| 3171 | in defense of harriet shelley | non-fiction | 0.000034 | 0.012185 | 0.000034 | 0.012122 | 0.000034 | 0.085039 | 0.000034 | 0.000034 | 0.191222 | 0.000034 | 0.000034 | 0.000849 | 0.003289 | 0.070596 | 0.000034 | 0.006989 | 0.001492 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.070466 | 0.214361 | 0.000034 | 0.000034 | 0.185338 | 0.000034 | 0.039906 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.020583 | 0.000034 | 0.011135 | 0.073621 | 0.000034 | 0.000034 |
| 3172 | fenimore coopers literary offences | non-fiction | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.124599 | 0.000033 | 0.000033 | 0.316952 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.043526 | 0.000033 | 0.000033 | 0.000033 | 0.009827 | 0.000033 | 0.157960 | 0.000033 | 0.000033 | 0.000033 | 0.191427 | 0.000033 | 0.069185 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.000033 | 0.085464 | 0.000033 | 0.000033 |
| 3173 | essays on paul bourget | non-fiction | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.333928 | 0.000034 | 0.000034 | 0.358450 | 0.000034 | 0.000034 | 0.000034 | 0.139035 | 0.016437 | 0.020053 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.100168 | 0.000034 | 0.000034 | 0.000034 | 0.013662 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.007288 | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.009913 | 0.000034 |
| 3176 | the innocents abroad | non-fiction | 0.029505 | 0.002304 | 0.000949 | 0.006506 | 0.008949 | 0.026396 | 0.003114 | 0.014719 | 0.080961 | 0.003917 | 0.000942 | 0.049456 | 0.002922 | 0.008070 | 0.015862 | 0.005765 | 0.002381 | 0.000066 | 0.006395 | 0.000918 | 0.005015 | 0.001251 | 0.003263 | 0.004839 | 0.033706 | 0.015327 | 0.007128 | 0.002813 | 0.217566 | 0.012259 | 0.004339 | 0.007663 | 0.037549 | 0.003257 | 0.001281 | 0.324273 | 0.006877 | 0.009630 | 0.030135 | 0.001730 |
| 3177 | roughing it | novel | 0.012992 | 0.002765 | 0.014333 | 0.000394 | 0.019562 | 0.043561 | 0.000930 | 0.035867 | 0.095350 | 0.007011 | 0.003234 | 0.012701 | 0.001741 | 0.010103 | 0.023408 | 0.009128 | 0.001816 | 0.000089 | 0.033219 | 0.000089 | 0.017188 | 0.000125 | 0.008954 | 0.018542 | 0.022195 | 0.014141 | 0.009638 | 0.004959 | 0.355956 | 0.003465 | 0.010804 | 0.015089 | 0.017879 | 0.001614 | 0.008950 | 0.038038 | 0.007441 | 0.008863 | 0.106673 | 0.001196 |
| 3178 | the gilded age | novel | 0.025880 | 0.006614 | 0.005308 | 0.010801 | 0.022757 | 0.113402 | 0.000100 | 0.020267 | 0.142413 | 0.006764 | 0.015285 | 0.003583 | 0.002244 | 0.010394 | 0.006556 | 0.009220 | 0.002843 | 0.000100 | 0.053766 | 0.000683 | 0.020436 | 0.006935 | 0.009766 | 0.045014 | 0.078775 | 0.005416 | 0.027073 | 0.021289 | 0.047869 | 0.097384 | 0.006764 | 0.006939 | 0.019824 | 0.014720 | 0.002743 | 0.016925 | 0.035721 | 0.011539 | 0.056405 | 0.009480 |
| 3179 | the american claimant | novel | 0.026804 | 0.001621 | 0.000900 | 0.000354 | 0.025078 | 0.051622 | 0.003494 | 0.012931 | 0.425034 | 0.001172 | 0.002532 | 0.004486 | 0.000082 | 0.006056 | 0.006309 | 0.002132 | 0.001702 | 0.000082 | 0.098701 | 0.006431 | 0.003784 | 0.004996 | 0.004435 | 0.005611 | 0.043318 | 0.010739 | 0.012837 | 0.016985 | 0.022811 | 0.083305 | 0.000082 | 0.013987 | 0.003046 | 0.048169 | 0.000627 | 0.014003 | 0.014356 | 0.003770 | 0.014325 | 0.001290 |
| 3180 | a double barrelled detective story | stories | 0.060737 | 0.000187 | 0.011478 | 0.003985 | 0.087922 | 0.000187 | 0.001350 | 0.036353 | 0.198311 | 0.000187 | 0.000187 | 0.000187 | 0.000187 | 0.040224 | 0.023056 | 0.003893 | 0.000187 | 0.000187 | 0.180209 | 0.004277 | 0.000187 | 0.000187 | 0.015526 | 0.047129 | 0.110753 | 0.004326 | 0.015328 | 0.024609 | 0.079133 | 0.000187 | 0.000187 | 0.000187 | 0.000187 | 0.011706 | 0.000187 | 0.000187 | 0.011629 | 0.003920 | 0.020969 | 0.000187 |
| 3181 | the stolen white elephant | stories | 0.011084 | 0.000069 | 0.000069 | 0.000069 | 0.000069 | 0.095697 | 0.000069 | 0.093768 | 0.093206 | 0.000069 | 0.012880 | 0.000069 | 0.000069 | 0.000069 | 0.019576 | 0.000069 | 0.000069 | 0.332541 | 0.000069 | 0.000069 | 0.040520 | 0.000069 | 0.016949 | 0.016004 | 0.058283 | 0.000069 | 0.016890 | 0.000069 | 0.099416 | 0.000069 | 0.003488 | 0.000069 | 0.000069 | 0.000069 | 0.000069 | 0.000069 | 0.000069 | 0.016607 | 0.071435 | 0.000069 |
| 3182 | some rambling notes of an idle excursion | non-fiction | 0.010924 | 0.000043 | 0.000043 | 0.000595 | 0.029328 | 0.016605 | 0.000043 | 0.010001 | 0.209493 | 0.017555 | 0.004909 | 0.082340 | 0.000043 | 0.021463 | 0.000043 | 0.016702 | 0.000043 | 0.000043 | 0.115642 | 0.000043 | 0.040879 | 0.000043 | 0.000043 | 0.000043 | 0.012665 | 0.005565 | 0.000043 | 0.000043 | 0.235806 | 0.016294 | 0.005979 | 0.032909 | 0.000043 | 0.000043 | 0.000043 | 0.093898 | 0.000043 | 0.000043 | 0.019632 | 0.000043 |
| 3183 | the facts concerning the recent carnival of crime in connecticut | stories | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.567401 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.054135 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.116567 | 0.000027 | 0.000027 | 0.016214 | 0.025588 | 0.000027 | 0.000027 | 0.078630 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.000027 | 0.022760 | 0.004409 | 0.000027 | 0.030977 | 0.067589 | 0.014946 | 0.000027 |
| 3184 | alonzo fitz and other stories | stories | 0.070844 | 0.035533 | 0.006579 | 0.076966 | 0.000100 | 0.055767 | 0.024276 | 0.016597 | 0.253554 | 0.005676 | 0.002321 | 0.003493 | 0.000100 | 0.018799 | 0.052588 | 0.020610 | 0.006932 | 0.000100 | 0.013157 | 0.000100 | 0.000100 | 0.000526 | 0.034607 | 0.004314 | 0.061596 | 0.000100 | 0.001323 | 0.035962 | 0.043148 | 0.010330 | 0.003511 | 0.033554 | 0.005483 | 0.011928 | 0.016344 | 0.012413 | 0.027123 | 0.006151 | 0.027293 | 0.000100 |
| 3185 | those extraordinary twins | stories | 0.051618 | 0.000121 | 0.000121 | 0.000121 | 0.006009 | 0.040372 | 0.000121 | 0.029985 | 0.336576 | 0.007004 | 0.000121 | 0.006005 | 0.000121 | 0.002539 | 0.015421 | 0.007450 | 0.000121 | 0.000121 | 0.076600 | 0.000121 | 0.013117 | 0.001470 | 0.030799 | 0.088584 | 0.018600 | 0.000121 | 0.000121 | 0.066925 | 0.000121 | 0.073847 | 0.002360 | 0.004898 | 0.054767 | 0.021661 | 0.026911 | 0.000121 | 0.014542 | 0.000121 | 0.000121 | 0.000121 |
| 3186 | the mysterious stranger and other stories | stories | 0.088795 | 0.002064 | 0.007931 | 0.000100 | 0.046225 | 0.009184 | 0.000409 | 0.012778 | 0.419846 | 0.000100 | 0.001181 | 0.001084 | 0.001255 | 0.000100 | 0.000100 | 0.016131 | 0.003312 | 0.000100 | 0.078091 | 0.000100 | 0.000100 | 0.000100 | 0.003497 | 0.017478 | 0.006819 | 0.000100 | 0.000100 | 0.015278 | 0.013833 | 0.002915 | 0.003705 | 0.007642 | 0.007134 | 0.000100 | 0.002626 | 0.051234 | 0.156392 | 0.000100 | 0.021858 | 0.000100 |
| 3188 | mark twain speeches | non-fiction | 0.017354 | 0.015212 | 0.001637 | 0.000908 | 0.001994 | 0.176339 | 0.035388 | 0.007604 | 0.166658 | 0.007215 | 0.006539 | 0.010326 | 0.041526 | 0.010034 | 0.014208 | 0.008711 | 0.003314 | 0.005992 | 0.027776 | 0.002020 | 0.009121 | 0.001813 | 0.004547 | 0.011482 | 0.203406 | 0.011931 | 0.032577 | 0.002125 | 0.021376 | 0.018252 | 0.003065 | 0.008615 | 0.027685 | 0.002282 | 0.011028 | 0.013487 | 0.013400 | 0.004700 | 0.036388 | 0.001964 |
| 3189 | sketches new and old | stories | 0.030225 | 0.014409 | 0.002662 | 0.003720 | 0.024096 | 0.081282 | 0.038779 | 0.004678 | 0.166661 | 0.016553 | 0.010344 | 0.012444 | 0.014498 | 0.008122 | 0.023482 | 0.016807 | 0.002727 | 0.000186 | 0.034487 | 0.004357 | 0.017239 | 0.029027 | 0.018080 | 0.032765 | 0.074420 | 0.006648 | 0.003738 | 0.010512 | 0.078410 | 0.020527 | 0.022550 | 0.001477 | 0.019163 | 0.006485 | 0.014937 | 0.019339 | 0.034631 | 0.026012 | 0.051619 | 0.001902 |
| 3190 | 1601 conversation as it was by the social fireside in the time of the tudors | stories | 0.020241 | 0.000091 | 0.000091 | 0.168709 | 0.000091 | 0.182268 | 0.000091 | 0.012281 | 0.052307 | 0.000091 | 0.000091 | 0.000091 | 0.000091 | 0.000091 | 0.038442 | 0.000091 | 0.000091 | 0.000091 | 0.016059 | 0.000091 | 0.011184 | 0.001813 | 0.000091 | 0.018512 | 0.338629 | 0.014689 | 0.000091 | 0.000091 | 0.007642 | 0.015055 | 0.000091 | 0.000091 | 0.050329 | 0.000091 | 0.000091 | 0.013607 | 0.000091 | 0.034799 | 0.000091 | 0.001436 |
| 3191 | goldsmiths friend abroad again | stories | 0.034448 | 0.000290 | 0.000290 | 0.020891 | 0.000290 | 0.016789 | 0.007057 | 0.096686 | 0.190280 | 0.024665 | 0.002496 | 0.071547 | 0.000290 | 0.026582 | 0.012434 | 0.000290 | 0.007255 | 0.000290 | 0.082032 | 0.000290 | 0.029389 | 0.000290 | 0.000290 | 0.133083 | 0.000290 | 0.068552 | 0.000290 | 0.000290 | 0.000290 | 0.000290 | 0.000290 | 0.000290 | 0.000290 | 0.000290 | 0.080194 | 0.000290 | 0.000290 | 0.000290 | 0.089230 | 0.000290 |
| 3192 | the curious republic of gondour and other whimsical sketches | stories | 0.002787 | 0.000366 | 0.000366 | 0.000366 | 0.000366 | 0.207245 | 0.004038 | 0.018737 | 0.110164 | 0.000366 | 0.007578 | 0.000366 | 0.039845 | 0.001026 | 0.062075 | 0.009014 | 0.000366 | 0.000366 | 0.049636 | 0.000366 | 0.019193 | 0.007298 | 0.013566 | 0.022362 | 0.141918 | 0.000366 | 0.024047 | 0.000366 | 0.046380 | 0.015176 | 0.001172 | 0.000366 | 0.037585 | 0.000366 | 0.016921 | 0.053327 | 0.017545 | 0.020181 | 0.044130 | 0.002296 |
| 3199 | the letters of mark twain | non-fiction | 0.010030 | 0.000719 | 0.000595 | 0.000965 | 0.004062 | 0.039612 | 0.002030 | 0.008451 | 0.100287 | 0.000032 | 0.002552 | 0.013279 | 0.000077 | 0.003288 | 0.007062 | 0.002565 | 0.000511 | 0.000032 | 0.009144 | 0.000133 | 0.000623 | 0.000415 | 0.000032 | 0.003425 | 0.604628 | 0.004145 | 0.004400 | 0.015627 | 0.048818 | 0.011539 | 0.005414 | 0.009026 | 0.017192 | 0.000501 | 0.005358 | 0.018648 | 0.012119 | 0.001311 | 0.031320 | 0.000032 |
| 3250 | how to tell a story and other essays | non-fiction | 0.000154 | 0.002918 | 0.023603 | 0.000154 | 0.046718 | 0.000154 | 0.000154 | 0.021177 | 0.273175 | 0.011341 | 0.003361 | 0.000154 | 0.000154 | 0.009182 | 0.000154 | 0.015302 | 0.000154 | 0.000154 | 0.061495 | 0.000154 | 0.000154 | 0.198694 | 0.000154 | 0.000154 | 0.252201 | 0.000154 | 0.051044 | 0.000154 | 0.005985 | 0.000154 | 0.000154 | 0.000154 | 0.000154 | 0.019958 | 0.000154 | 0.000154 | 0.000154 | 0.000154 | 0.000154 | 0.000154 |
| 3251 | the man that corrupted hadleyburg and other stories | stories | 0.106797 | 0.003092 | 0.001099 | 0.000571 | 0.024848 | 0.105268 | 0.001421 | 0.013612 | 0.322301 | 0.000063 | 0.005808 | 0.011495 | 0.014739 | 0.009264 | 0.012180 | 0.002858 | 0.003013 | 0.008216 | 0.013358 | 0.002110 | 0.002432 | 0.000389 | 0.004371 | 0.012769 | 0.043769 | 0.013054 | 0.006356 | 0.009824 | 0.044367 | 0.012564 | 0.017193 | 0.000854 | 0.009265 | 0.002897 | 0.011301 | 0.042568 | 0.046584 | 0.023297 | 0.031339 | 0.002691 |
| 19337 | a christmas carol | novel | 0.000037 | 0.025192 | 0.018158 | 0.004069 | 0.085491 | 0.000037 | 0.000037 | 0.000037 | 0.000037 | 0.013482 | 0.000037 | 0.002719 | 0.007778 | 0.001321 | 0.000037 | 0.096320 | 0.000037 | 0.000037 | 0.017769 | 0.000037 | 0.000037 | 0.000331 | 0.040343 | 0.001199 | 0.005437 | 0.036021 | 0.000037 | 0.000037 | 0.000037 | 0.003521 | 0.000037 | 0.033510 | 0.048031 | 0.098614 | 0.004814 | 0.033424 | 0.412742 | 0.000037 | 0.009084 | 0.000037 |
| 19484 | editorial wild oats | stories | 0.016739 | 0.021627 | 0.010532 | 0.000121 | 0.021929 | 0.033859 | 0.000121 | 0.027905 | 0.161669 | 0.056337 | 0.037636 | 0.000121 | 0.000121 | 0.010481 | 0.000121 | 0.018697 | 0.000121 | 0.000121 | 0.077391 | 0.000121 | 0.000121 | 0.000121 | 0.054033 | 0.007920 | 0.241402 | 0.000121 | 0.000121 | 0.004059 | 0.000121 | 0.000121 | 0.045244 | 0.008501 | 0.014117 | 0.000121 | 0.000121 | 0.039252 | 0.056616 | 0.000121 | 0.031885 | 0.000121 |
| 19987 | chapters from my autobiography | non-fiction | 0.015152 | 0.005424 | 0.004267 | 0.001678 | 0.014649 | 0.047126 | 0.034514 | 0.014155 | 0.338425 | 0.004017 | 0.002687 | 0.005858 | 0.029485 | 0.002525 | 0.011599 | 0.011817 | 0.004155 | 0.001678 | 0.008206 | 0.002433 | 0.010460 | 0.001951 | 0.005298 | 0.006158 | 0.148161 | 0.008003 | 0.029951 | 0.026940 | 0.037969 | 0.017201 | 0.005320 | 0.003811 | 0.022530 | 0.001678 | 0.002674 | 0.022020 | 0.021782 | 0.001678 | 0.064892 | 0.001678 |
| 20795 | the cricket on the hearth | novel | 0.003587 | 0.098341 | 0.000017 | 0.001369 | 0.051946 | 0.004145 | 0.000017 | 0.000017 | 0.000017 | 0.011578 | 0.000017 | 0.000017 | 0.000017 | 0.000017 | 0.000017 | 0.033662 | 0.000017 | 0.000017 | 0.002876 | 0.000017 | 0.000017 | 0.000017 | 0.024834 | 0.000017 | 0.000017 | 0.000017 | 0.000017 | 0.059481 | 0.000017 | 0.184527 | 0.002035 | 0.074338 | 0.026702 | 0.198719 | 0.065576 | 0.000017 | 0.155894 | 0.000017 | 0.000017 | 0.000017 |
| 27924 | mugby junction | stories | 0.007883 | 0.005528 | 0.015614 | 0.000027 | 0.186989 | 0.020384 | 0.000613 | 0.000027 | 0.013005 | 0.004712 | 0.004886 | 0.009028 | 0.000027 | 0.010150 | 0.023515 | 0.022364 | 0.000027 | 0.003922 | 0.029266 | 0.009913 | 0.041854 | 0.000027 | 0.019164 | 0.000741 | 0.018143 | 0.005397 | 0.146543 | 0.011581 | 0.008341 | 0.131564 | 0.007190 | 0.090121 | 0.014115 | 0.029567 | 0.006791 | 0.026868 | 0.060773 | 0.000027 | 0.013287 | 0.000027 |
| 33077 | the treaty with china its provisions explained | non-fiction | 0.000025 | 0.000025 | 0.000025 | 0.000025 | 0.000025 | 0.565564 | 0.000025 | 0.000025 | 0.139387 | 0.000025 | 0.000025 | 0.000025 | 0.000025 | 0.013583 | 0.092394 | 0.000025 | 0.015492 | 0.000025 | 0.000025 | 0.000025 | 0.000025 | 0.001409 | 0.000025 | 0.006396 | 0.016241 | 0.021771 | 0.000025 | 0.000025 | 0.072968 | 0.000025 | 0.000025 | 0.000025 | 0.000025 | 0.000025 | 0.000025 | 0.011115 | 0.000025 | 0.000025 | 0.042973 | 0.000025 |
| 35536 | the poems and verses of charles dickens | stories | 0.028606 | 0.016627 | 0.000251 | 0.033804 | 0.022100 | 0.078869 | 0.000251 | 0.000251 | 0.000251 | 0.000251 | 0.001142 | 0.007336 | 0.036706 | 0.026398 | 0.026339 | 0.000251 | 0.000251 | 0.000251 | 0.006963 | 0.012798 | 0.017928 | 0.008544 | 0.000251 | 0.000251 | 0.160810 | 0.008572 | 0.006206 | 0.005921 | 0.028451 | 0.000251 | 0.006467 | 0.011599 | 0.106189 | 0.039718 | 0.121656 | 0.056794 | 0.066755 | 0.037210 | 0.004360 | 0.012377 |
| 60900 | merry tales | stories | 0.041890 | 0.000041 | 0.001007 | 0.000921 | 0.060213 | 0.010121 | 0.018594 | 0.059427 | 0.300134 | 0.008799 | 0.000041 | 0.005328 | 0.000041 | 0.009297 | 0.011548 | 0.000041 | 0.000041 | 0.000041 | 0.146301 | 0.000041 | 0.000041 | 0.001830 | 0.023285 | 0.000041 | 0.054089 | 0.000041 | 0.039215 | 0.003046 | 0.053715 | 0.011402 | 0.001232 | 0.018469 | 0.009726 | 0.020370 | 0.004114 | 0.000041 | 0.007727 | 0.011255 | 0.066455 | 0.000041 |
| 61522 | the 1000000 bank note | stories | 0.000019 | 0.000019 | 0.000019 | 0.000344 | 0.006644 | 0.027626 | 0.020171 | 0.010639 | 0.216060 | 0.000019 | 0.002364 | 0.055133 | 0.000019 | 0.013335 | 0.000019 | 0.000019 | 0.000019 | 0.002163 | 0.023513 | 0.000019 | 0.000944 | 0.000019 | 0.009940 | 0.000019 | 0.083591 | 0.045708 | 0.026836 | 0.000019 | 0.027197 | 0.016926 | 0.005115 | 0.005671 | 0.015416 | 0.007363 | 0.000019 | 0.047304 | 0.010133 | 0.297055 | 0.022543 | 0.000019 |
| 62636 | to the person sitting in darkness | non-fiction | 0.059227 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.114464 | 0.000036 | 0.127221 | 0.356787 | 0.000036 | 0.000036 | 0.000036 | 0.235829 | 0.018680 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.086604 | 0.000036 |
| 62739 | king leopolds soliloquy | stories | 0.040921 | 0.000234 | 0.000234 | 0.014970 | 0.000234 | 0.277346 | 0.001842 | 0.053755 | 0.195237 | 0.000234 | 0.000234 | 0.000234 | 0.048687 | 0.026170 | 0.033429 | 0.000234 | 0.000234 | 0.000234 | 0.005593 | 0.000234 | 0.000234 | 0.000234 | 0.010849 | 0.060088 | 0.108774 | 0.000234 | 0.005016 | 0.000234 | 0.027254 | 0.000234 | 0.000234 | 0.000234 | 0.000234 | 0.015380 | 0.051293 | 0.000234 | 0.008413 | 0.000234 | 0.010068 | 0.000234 |
# Dominant topic per work type: average the rows within each 'type' group,
# then report, for each group, the column label holding the largest mean.
(
    book_mean_theta
    .groupby('type')
    .mean()
    .idxmax(axis='columns')
)
type non-fiction 8 novel 29 stories 8 dtype: int64
# Dominant topic per book: for every row of book_mean_theta take the column
# label with the largest value. Naming the Series 'topic_id' makes
# reset_index() emit a 'topic_id' column directly, which then becomes the
# index used as the join key below.
max_topic = (
    book_mean_theta
    .idxmax(axis=1)
    .rename('topic_id')
    .reset_index()
    .set_index('topic_id')
)
# Attach the per-topic columns of tm.TOPIC via an index-on-index join
# (assumes tm.TOPIC is indexed by topic_id -- confirm), then re-key by book_id.
max_topic = max_topic.join(tm.TOPIC).reset_index().set_index('book_id')
# First five whitespace-separated terms of top_terms_rel. The .str accessor is
# vectorised and propagates missing values instead of raising on them.
max_topic['top_five_terms'] = max_topic['top_terms_rel'].str.split().str[:5]
# Display: highest topic_id first, without the 'label' column, with topic_id
# shaded by value.
(
    max_topic
    .sort_values('topic_id', ascending=False)
    .drop('label', axis=1)
    .style
    .background_gradient(cmap='YlGnBu', subset=['topic_id'])
)
| topic_id | title | type | phi_sum | theta_sum | h | top_terms_rel | top_terms | top_five_terms | |
|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||
| 1415 | 38 | doctor marigold | stories | 17024.101091 | 45.659177 | 8.480000 | dollars cent wages cents per sold sell buy coal | dollars gold pay worth per sold silver cent government | ['dollars', 'cent', 'wages', 'cents', 'per'] |
| 61522 | 37 | the 1000000 bank note | stories | 12154.044627 | 17.790633 | 9.470000 | thy parents author thou thee madam graceful stars bosom | thy thee thou soul ye noble youth parents bear | ['thy', 'parents', 'author', 'thou', 'thee'] |
| 19337 | 36 | a christmas carol | novel | 45127.533127 | 82.742349 | 9.750000 | merry charity sisters sorrow mercy brothers nephew ghost younger | spirit merry bear brothers tears earth thoughts truth bright | ['merry', 'charity', 'sisters', 'sorrow', 'mercy'] |
| 676 | 36 | the battle of life | novel | 45127.533127 | 82.742349 | 9.750000 | merry charity sisters sorrow mercy brothers nephew ghost younger | spirit merry bear brothers tears earth thoughts truth bright | ['merry', 'charity', 'sisters', 'sorrow', 'mercy'] |
| 3176 | 35 | the innocents abroad | non-fiction | 41938.713871 | 85.799296 | 9.610000 | marble centuries pictures ancient picturesque palace painted walls stone | stone picture walls ancient sea houses streets sun pictures | ['marble', 'centuries', 'pictures', 'ancient', 'picturesque'] |
| 650 | 35 | pictures from italy | non-fiction | 41938.713871 | 85.799296 | 9.610000 | marble centuries pictures ancient picturesque palace painted walls stone | stone picture walls ancient sea houses streets sun pictures | ['marble', 'centuries', 'pictures', 'ancient', 'picturesque'] |
| 20795 | 33 | the cricket on the hearth | novel | 41170.236899 | 69.274817 | 9.540000 | ha ant eh youre hes jolly em havent retorted | ha youre hes em youll whats office eh pleasant | ['ha', 'ant', 'eh', 'youre', 'hes'] |
| 927 | 33 | the lamplighter | stories | 41170.236899 | 69.274817 | 9.540000 | ha ant eh youre hes jolly em havent retorted | ha youre hes em youll whats office eh pleasant | ['ha', 'ant', 'eh', 'youre', 'hes'] |
| 917 | 33 | barnaby rudge | stories | 41170.236899 | 69.274817 | 9.540000 | ha ant eh youre hes jolly em havent retorted | ha youre hes em youll whats office eh pleasant | ['ha', 'ant', 'eh', 'youre', 'hes'] |
| 912 | 32 | the mudfog and other sketches | stories | 50898.403590 | 97.576213 | 9.850000 | theatre audience dancing ball stout applause circle punch gallery | party everybody wine glass stage appearance blue theatre oclock | ['theatre', 'audience', 'dancing', 'ball', 'stout'] |
| 882 | 32 | sketches by boz | stories | 50898.403590 | 97.576213 | 9.850000 | theatre audience dancing ball stout applause circle punch gallery | party everybody wine glass stage appearance blue theatre oclock | ['theatre', 'audience', 'dancing', 'ball', 'stout'] |
| 916 | 32 | sketches of young couples | stories | 50898.403590 | 97.576213 | 9.850000 | theatre audience dancing ball stout applause circle punch gallery | party everybody wine glass stage appearance blue theatre oclock | ['theatre', 'audience', 'dancing', 'ball', 'stout'] |
| 918 | 32 | sketches of young gentlemen | stories | 50898.403590 | 97.576213 | 9.850000 | theatre audience dancing ball stout applause circle punch gallery | party everybody wine glass stage appearance blue theatre oclock | ['theatre', 'audience', 'dancing', 'ball', 'stout'] |
| 1394 | 31 | the holly tree | stories | 34759.348239 | 56.914861 | 9.450000 | travelling wheels horses landlord lamps roads road carriage cart | road horses horse carriage wind weather trees journey green | ['travelling', 'wheels', 'horses', 'landlord', 'lamps'] |
| 1406 | 30 | the perils of certain english prisoners | stories | 10004.087665 | 21.173252 | 8.080000 | boat boats tide island lion shore ashore steam stream | boat boats island shore bank tide lion stream ashore | ['boat', 'boats', 'tide', 'island', 'lion'] |
| 883 | 29 | our mutual friend | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 564 | 29 | the mystery of edwin drood | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 766 | 29 | david copperfield | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 786 | 29 | hard times | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 807 | 29 | hunted down | stories | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 810 | 29 | george silvermans explanation | stories | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 821 | 29 | dombey and sons | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 968 | 29 | martin chuzzlewit | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 1400 | 29 | great expectations | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 1407 | 29 | a message from the sea | stories | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 1416 | 29 | mrs lirripers lodgings | stories | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 1421 | 29 | mrs lirripers legacy | stories | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 1467 | 29 | some christmas stories | stories | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 2324 | 29 | a house to let | stories | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 1023 | 29 | bleak house | novel | 138476.137464 | 220.112446 | 10.220000 | guardian cousin assure sister confidence pursued dearest madam agreeable | sister understand observed guardian confidence daughter honour thank please | ['guardian', 'cousin', 'assure', 'sister', 'confidence'] |
| 119 | 28 | a tramp abroad | non-fiction | 48624.881840 | 118.187148 | 9.790000 | lake mountain valley mountains ice rock miles forest snow | miles mountain land lake rock mountains distance snow ice | ['lake', 'mountain', 'valley', 'mountains', 'ice'] |
| 3182 | 28 | some rambling notes of an idle excursion | non-fiction | 48624.881840 | 118.187148 | 9.790000 | lake mountain valley mountains ice rock miles forest snow | miles mountain land lake rock mountains distance snow ice | ['lake', 'mountain', 'valley', 'mountains', 'ice'] |
| 3177 | 28 | roughing it | novel | 48624.881840 | 118.187148 | 9.790000 | lake mountain valley mountains ice rock miles forest snow | miles mountain land lake rock mountains distance snow ice | ['lake', 'mountain', 'valley', 'mountains', 'ice'] |
| 245 | 28 | life on the mississippi | non-fiction | 48624.881840 | 118.187148 | 9.790000 | lake mountain valley mountains ice rock miles forest snow | miles mountain land lake rock mountains distance snow ice | ['lake', 'mountain', 'valley', 'mountains', 'ice'] |
| 888 | 25 | the lazy tour of two idle apprentices | stories | 26769.288410 | 48.638219 | 9.400000 | waiter shops idle police dirty market plate houses shillings | houses streets shop idle waiter iron windows women yard | ['waiter', 'shops', 'idle', 'police', 'dirty'] |
| 872 | 25 | reprinted pieces | stories | 26769.288410 | 48.638219 | 9.400000 | waiter shops idle police dirty market plate houses shillings | houses streets shop idle waiter iron windows women yard | ['waiter', 'shops', 'idle', 'police', 'dirty'] |
| 914 | 25 | the uncommerical traveller | non-fiction | 26769.288410 | 48.638219 | 9.400000 | waiter shops idle police dirty market plate houses shillings | houses streets shop idle waiter iron windows women yard | ['waiter', 'shops', 'idle', 'police', 'dirty'] |
| 35536 | 24 | the poems and verses of charles dickens | stories | 50656.218241 | 105.019908 | 9.730000 | lecture 3 wrote literary 2 author letters machine print | wrote letters write written story paper writing send books | ['lecture', '3', 'wrote', 'literary', '2'] |
| 19484 | 24 | editorial wild oats | stories | 50656.218241 | 105.019908 | 9.730000 | lecture 3 wrote literary 2 author letters machine print | wrote letters write written story paper writing send books | ['lecture', '3', 'wrote', 'literary', '2'] |
| 3199 | 24 | the letters of mark twain | non-fiction | 50656.218241 | 105.019908 | 9.730000 | lecture 3 wrote literary 2 author letters machine print | wrote letters write written story paper writing send books | ['lecture', '3', 'wrote', 'literary', '2'] |
| 3190 | 24 | 1601 conversation as it was by the social fireside in the time of the tudors | stories | 50656.218241 | 105.019908 | 9.730000 | lecture 3 wrote literary 2 author letters machine print | wrote letters write written story paper writing send books | ['lecture', '3', 'wrote', 'literary', '2'] |
| 3188 | 24 | mark twain speeches | non-fiction | 50656.218241 | 105.019908 | 9.730000 | lecture 3 wrote literary 2 author letters machine print | wrote letters write written story paper writing send books | ['lecture', '3', 'wrote', 'literary', '2'] |
| 3171 | 24 | in defense of harriet shelley | non-fiction | 50656.218241 | 105.019908 | 9.730000 | lecture 3 wrote literary 2 author letters machine print | wrote letters write written story paper writing send books | ['lecture', '3', 'wrote', 'literary', '2'] |
| 1414 | 24 | somebodys luggage | stories | 50656.218241 | 105.019908 | 9.730000 | lecture 3 wrote literary 2 author letters machine print | wrote letters write written story paper writing send books | ['lecture', '3', 'wrote', 'literary', '2'] |
| 967 | 22 | nicholas nickleby | novel | 72011.366796 | 111.941402 | 10.070000 | rejoined inquired interposed hastily gentlemans exclaimed indignation countenance thrust | inquired rejoined exclaimed countenance servant minutes turning reply certainly | ['rejoined', 'inquired', 'interposed', 'hastily', 'gentlemans'] |
| 580 | 22 | the pickwick papers | novel | 72011.366796 | 111.941402 | 10.070000 | rejoined inquired interposed hastily gentlemans exclaimed indignation countenance thrust | inquired rejoined exclaimed countenance servant minutes turning reply certainly | ['rejoined', 'inquired', 'interposed', 'hastily', 'gentlemans'] |
| 730 | 22 | oliver twist | novel | 72011.366796 | 111.941402 | 10.070000 | rejoined inquired interposed hastily gentlemans exclaimed indignation countenance thrust | inquired rejoined exclaimed countenance servant minutes turning reply certainly | ['rejoined', 'inquired', 'interposed', 'hastily', 'gentlemans'] |
| 700 | 20 | the old curiosity shop | novel | 29544.283556 | 55.121146 | 9.670000 | dwarf grandfather beneath childs sleeping dreary anxiety roused poverty | dwarf grandfather strange sleep silence led noise beneath died | ['dwarf', 'grandfather', 'beneath', 'childs', 'sleeping'] |
| 588 | 20 | master humphreys clock | stories | 29544.283556 | 55.121146 | 9.670000 | dwarf grandfather beneath childs sleeping dreary anxiety roused poverty | dwarf grandfather strange sleep silence led noise beneath died | ['dwarf', 'grandfather', 'beneath', 'childs', 'sleeping'] |
| 1044 | 18 | extract from captain stormfields visit to Heaven | stories | 34475.634392 | 101.499568 | 9.250000 | reckon warnt nigger bet maybe theyre anyway judged hed | warnt reckon hes minute hadnt everybody big maybe anybody | ['reckon', 'warnt', 'nigger', 'bet', 'maybe'] |
| 93 | 18 | tom sawyer detective | novel | 34475.634392 | 101.499568 | 9.250000 | reckon warnt nigger bet maybe theyre anyway judged hed | warnt reckon hes minute hadnt everybody big maybe anybody | ['reckon', 'warnt', 'nigger', 'bet', 'maybe'] |
| 91 | 18 | tom sawyer abroad | novel | 34475.634392 | 101.499568 | 9.250000 | reckon warnt nigger bet maybe theyre anyway judged hed | warnt reckon hes minute hadnt everybody big maybe anybody | ['reckon', 'warnt', 'nigger', 'bet', 'maybe'] |
| 76 | 18 | the adventures of huckleberry finn | novel | 34475.634392 | 101.499568 | 9.250000 | reckon warnt nigger bet maybe theyre anyway judged hed | warnt reckon hes minute hadnt everybody big maybe anybody | ['reckon', 'warnt', 'nigger', 'bet', 'maybe'] |
| 74 | 18 | the adventures of tom sawyer | novel | 34475.634392 | 101.499568 | 9.250000 | reckon warnt nigger bet maybe theyre anyway judged hed | warnt reckon hes minute hadnt everybody big maybe anybody | ['reckon', 'warnt', 'nigger', 'bet', 'maybe'] |
| 3181 | 17 | the stolen white elephant | stories | 1823.899489 | 4.987247 | 8.420000 | traveller pound meal eat landlord gate paid ruined shillings | traveller pound eat gate paid meal landlord office fifteen | ['traveller', 'pound', 'meal', 'eat', 'landlord'] |
| 1413 | 17 | tom tiddlers ground | stories | 1823.899489 | 4.987247 | 8.420000 | traveller pound meal eat landlord gate paid ruined shillings | traveller pound eat gate paid meal landlord office fifteen | ['traveller', 'pound', 'meal', 'eat', 'landlord'] |
| 699 | 14 | a childs history of england | non-fiction | 28395.221946 | 56.375642 | 9.460000 | council lords army french castle fought killed religion battle | french sent army died war castle killed afterwards court | ['council', 'lords', 'army', 'french', 'castle'] |
| 1086 | 8 | a horses tale | novel | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3173 | 8 | essays on paul bourget | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3172 | 8 | fenimore coopers literary offences | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 2895 | 8 | following the equator | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 2874 | 8 | personal recollections of joan of arc vol 1 | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3178 | 8 | the gilded age | novel | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 142 | 8 | the 30000 bequest and other stories | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 102 | 8 | the tragedy of puddnhead wilson | novel | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 86 | 8 | a connecticut yankee in king arthurs court | novel | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3180 | 8 | a double barrelled detective story | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 70 | 8 | what is man | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3179 | 8 | the american claimant | novel | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3183 | 8 | the facts concerning the recent carnival of crime in connecticut | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3184 | 8 | alonzo fitz and other stories | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3185 | 8 | those extraordinary twins | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3186 | 8 | the mysterious stranger and other stories | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3189 | 8 | sketches new and old | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3191 | 8 | goldsmiths friend abroad again | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3250 | 8 | how to tell a story and other essays | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 3251 | 8 | the man that corrupted hadleyburg and other stories | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 19987 | 8 | chapters from my autobiography | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 60900 | 8 | merry tales | stories | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 62636 | 8 | to the person sitting in darkness | non-fiction | 89131.260172 | 217.495613 | 9.980000 | detail color doesnt details recognized honor rule nation afterward | presently toward ones isnt everybody war able chance merely | ['detail', 'color', 'doesnt', 'details', 'recognized'] |
| 1435 | 5 | miscellaneous papers | non-fiction | 56343.692496 | 150.168832 | 9.780000 | institution science political class national social association education legal | society human law character knowledge class christian institution members | ['institution', 'science', 'political', 'class', 'national'] |
| 675 | 5 | american notes | non-fiction | 56343.692496 | 150.168832 | 9.780000 | institution science political class national social association education legal | society human law character knowledge class christian institution members | ['institution', 'science', 'political', 'class', 'national'] |
| 824 | 5 | speeches of charles dickens | non-fiction | 56343.692496 | 150.168832 | 9.780000 | institution science political class national social association education legal | society human law character knowledge class christian institution members | ['institution', 'science', 'political', 'class', 'national'] |
| 922 | 5 | sunday under three heads | non-fiction | 56343.692496 | 150.168832 | 9.780000 | institution science political class national social association education legal | society human law character knowledge class christian institution members | ['institution', 'science', 'political', 'class', 'national'] |
| 62739 | 5 | king leopolds soliloquy | stories | 56343.692496 | 150.168832 | 9.780000 | institution science political class national social association education legal | society human law character knowledge class christian institution members | ['institution', 'science', 'political', 'class', 'national'] |
| 3192 | 5 | the curious republic of gondour and other whimsical sketches | stories | 56343.692496 | 150.168832 | 9.780000 | institution science political class national social association education legal | society human law character knowledge class christian institution members | ['institution', 'science', 'political', 'class', 'national'] |
| 33077 | 5 | the treaty with china its provisions explained | non-fiction | 56343.692496 | 150.168832 | 9.780000 | institution science political class national social association education legal | society human law character knowledge class christian institution members | ['institution', 'science', 'political', 'class', 'national'] |
| 27924 | 4 | mugby junction | stories | 62850.031662 | 113.004098 | 9.810000 | lock alarm lamp darkness dread horror muttered lighted nearer | figure slowly wind answered sound lips breast spoke opened | ['lock', 'alarm', 'lamp', 'darkness', 'dread'] |
| 1289 | 4 | three ghost stories | stories | 62850.031662 | 113.004098 | 9.810000 | lock alarm lamp darkness dread horror muttered lighted nearer | figure slowly wind answered sound lips breast spoke opened | ['lock', 'alarm', 'lamp', 'darkness', 'dread'] |
| 653 | 4 | the chimes | novel | 62850.031662 | 113.004098 | 9.810000 | lock alarm lamp darkness dread horror muttered lighted nearer | figure slowly wind answered sound lips breast spoke opened | ['lock', 'alarm', 'lamp', 'darkness', 'dread'] |
| 644 | 4 | the haunted man and the ghosts bargain | stories | 62850.031662 | 113.004098 | 9.810000 | lock alarm lamp darkness dread horror muttered lighted nearer | figure slowly wind answered sound lips breast spoke opened | ['lock', 'alarm', 'lamp', 'darkness', 'dread'] |
| 98 | 4 | a tale of two cities | novel | 62850.031662 | 113.004098 | 9.810000 | lock alarm lamp darkness dread horror muttered lighted nearer | figure slowly wind answered sound lips breast spoke opened | ['lock', 'alarm', 'lamp', 'darkness', 'dread'] |
| 1837 | 3 | the prince and the pauper | novel | 6787.624918 | 17.416021 | 7.430000 | thee thou ye thy lad punch knights rags mad | ye thee thou thy lad none art mad ah | ['thee', 'thou', 'ye', 'thy', 'lad'] |
| 809 | 1 | holiday romance | stories | 8929.895643 | 20.770954 | 8.580000 | locksmith school baby dance girls dancing parents hearty fish | school locksmith baby pocket girls laugh daughter married dance | ['locksmith', 'school', 'baby', 'dance', 'girls'] |
| 2875 | 0 | personal recollections of joan of arc vol 2 | non-fiction | 20027.842032 | 55.893150 | 8.940000 | maid count lie forever voices grace hearts begged message | lie ah none toward tears truth noble saying voices | ['maid', 'count', 'lie', 'forever', 'voices'] |
# Lift the column-width cap so the long, comma-joined title lists below are
# shown in full instead of being truncated.
pd.set_option('display.max_colwidth', None)

# One row per topic: the number of works assigned to it and their titles,
# most populated topic first.
works_df = (
    max_topic
    .groupby('topic_id')
    .agg({'topic_id': 'size', 'title': lambda titles: ', '.join(titles)})
    .rename(columns={'topic_id': 'count'})
    .sort_values('count', ascending=False)
)

# Column assignment aligns on works_df's topic_id index
# (assumes tm.TOPIC is indexed by topic id -- TODO confirm).
works_df['top_terms_rel'] = tm.TOPIC.top_terms_rel

works_df.reset_index().style.background_gradient(cmap='YlGnBu', subset=['topic_id'])
| topic_id | count | title | top_terms_rel | |
|---|---|---|---|---|
| 0 | 8 | 23 | what is man, a connecticut yankee in king arthurs court, the tragedy of puddnhead wilson, the 30000 bequest and other stories, a horses tale, personal recollections of joan of arc vol 1, following the equator, fenimore coopers literary offences, essays on paul bourget, the gilded age, the american claimant, a double barrelled detective story, the facts concerning the recent carnival of crime in connecticut, alonzo fitz and other stories, those extraordinary twins, the mysterious stranger and other stories, sketches new and old, goldsmiths friend abroad again, how to tell a story and other essays, the man that corrupted hadleyburg and other stories, chapters from my autobiography, merry tales, to the person sitting in darkness | detail color doesnt details recognized honor rule nation afterward |
| 1 | 29 | 15 | the mystery of edwin drood, david copperfield, hard times, hunted down, george silvermans explanation, dombey and sons, our mutual friend, martin chuzzlewit, bleak house, great expectations, a message from the sea, mrs lirripers lodgings, mrs lirripers legacy, some christmas stories, a house to let | guardian cousin assure sister confidence pursued dearest madam agreeable |
| 2 | 24 | 7 | somebodys luggage, in defense of harriet shelley, mark twain speeches, 1601 conversation as it was by the social fireside in the time of the tudors, the letters of mark twain, editorial wild oats, the poems and verses of charles dickens | lecture 3 wrote literary 2 author letters machine print |
| 3 | 5 | 7 | american notes, speeches of charles dickens, sunday under three heads, miscellaneous papers, the curious republic of gondour and other whimsical sketches, the treaty with china its provisions explained, king leopolds soliloquy | institution science political class national social association education legal |
| 4 | 4 | 5 | a tale of two cities, the haunted man and the ghosts bargain, the chimes, three ghost stories, mugby junction | lock alarm lamp darkness dread horror muttered lighted nearer |
| 5 | 18 | 5 | the adventures of tom sawyer, the adventures of huckleberry finn, tom sawyer abroad, tom sawyer detective, extract from captain stormfields visit to Heaven | reckon warnt nigger bet maybe theyre anyway judged hed |
| 6 | 32 | 4 | sketches by boz, the mudfog and other sketches, sketches of young couples, sketches of young gentlemen | theatre audience dancing ball stout applause circle punch gallery |
| 7 | 28 | 4 | a tramp abroad, life on the mississippi, roughing it, some rambling notes of an idle excursion | lake mountain valley mountains ice rock miles forest snow |
| 8 | 22 | 3 | the pickwick papers, oliver twist, nicholas nickleby | rejoined inquired interposed hastily gentlemans exclaimed indignation countenance thrust |
| 9 | 25 | 3 | reprinted pieces, the lazy tour of two idle apprentices, the uncommerical traveller | waiter shops idle police dirty market plate houses shillings |
| 10 | 33 | 3 | barnaby rudge, the lamplighter, the cricket on the hearth | ha ant eh youre hes jolly em havent retorted |
| 11 | 36 | 2 | the battle of life, a christmas carol | merry charity sisters sorrow mercy brothers nephew ghost younger |
| 12 | 17 | 2 | tom tiddlers ground, the stolen white elephant | traveller pound meal eat landlord gate paid ruined shillings |
| 13 | 20 | 2 | master humphreys clock, the old curiosity shop | dwarf grandfather beneath childs sleeping dreary anxiety roused poverty |
| 14 | 35 | 2 | pictures from italy, the innocents abroad | marble centuries pictures ancient picturesque palace painted walls stone |
| 15 | 37 | 1 | the 1000000 bank note | thy parents author thou thee madam graceful stars bosom |
| 16 | 0 | 1 | personal recollections of joan of arc vol 2 | maid count lie forever voices grace hearts begged message |
| 17 | 31 | 1 | the holly tree | travelling wheels horses landlord lamps roads road carriage cart |
| 18 | 30 | 1 | the perils of certain english prisoners | boat boats tide island lion shore ashore steam stream |
| 19 | 1 | 1 | holiday romance | locksmith school baby dance girls dancing parents hearty fish |
| 20 | 14 | 1 | a childs history of england | council lords army french castle fought killed religion battle |
| 21 | 3 | 1 | the prince and the pauper | thee thou ye thy lad punch knights rags mad |
| 22 | 38 | 1 | doctor marigold | dollars cent wages cents per sold sell buy coal |
# Restore pandas' built-in column-width limit (50 characters), see
# https://pandas.pydata.org/docs/user_guide/options.html
pd.reset_option('display.max_colwidth')
# Hyper-parameters passed straight through to gensim's Word2Vec.
w2v_params = {
    'min_count': 10,
    'workers': 1,
    # 'vector_size': 246,
    'vector_size': 100,
    'window': 2,
}

# One token list per sentence: group on every OHCO level except token_num.
SENTS = (
    CORPUS
    .groupby(OHCO[:-1])
    .term_str
    .apply(lambda tokens: tokens.tolist())
)

model = word2vec.Word2Vec(SENTS.values, **w2v_params)

# Table of normalised word vectors, one row per vocabulary term, sorted by term.
W2V = pd.DataFrame(
    model.wv.get_normed_vectors(),
    index=model.wv.index_to_key,
)
W2V.index.name = 'term_str'
W2V = W2V.sort_index()
W2V.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||||||||
| 0 | -0.168197 | 0.018952 | 0.025413 | 0.005438 | 0.097372 | -0.171935 | 0.085254 | 0.057859 | -0.150509 | -0.093497 | ... | 0.073231 | 0.070049 | -0.036947 | -0.052856 | 0.247726 | 0.087628 | 0.000105 | 0.004341 | 0.164670 | -0.054086 |
| 04 | -0.031947 | 0.000754 | -0.015000 | -0.020203 | 0.085838 | -0.088574 | 0.087527 | 0.199322 | -0.058843 | -0.068946 | ... | 0.102047 | 0.040068 | -0.028820 | 0.031878 | 0.044234 | 0.170845 | -0.055325 | -0.013117 | 0.063548 | -0.030138 |
| 08 | -0.052396 | 0.006009 | 0.034031 | 0.008314 | 0.056115 | -0.126131 | 0.092966 | 0.208461 | -0.063152 | -0.104492 | ... | 0.103707 | 0.013087 | -0.016439 | 0.035906 | 0.070789 | 0.204443 | -0.030207 | -0.026992 | 0.089753 | -0.042653 |
| 1 | -0.120617 | -0.009931 | -0.044832 | 0.057515 | 0.133221 | -0.062641 | 0.090187 | -0.013538 | -0.049871 | -0.038900 | ... | 0.019499 | -0.035111 | -0.023071 | 0.050238 | 0.220351 | 0.021539 | -0.061820 | -0.054285 | 0.136289 | -0.031151 |
| 10 | -0.064771 | 0.053249 | -0.015686 | 0.066639 | 0.094703 | -0.102791 | 0.043700 | 0.041097 | -0.103950 | -0.107940 | ... | 0.109693 | -0.019120 | -0.108438 | -0.060508 | 0.025758 | 0.079756 | 0.032051 | -0.039341 | 0.081146 | -0.007586 |
5 rows × 100 columns
import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and 1.7
# dropped the old spelling, so use whichever name the installed release accepts.
_tsne_iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

tsne_params = {
    'learning_rate': 200.,   # alternatives: 'auto' or a float in [10.0, 1000.0]
    'perplexity': 40,
    'n_components': 2,       # 2-D output, used as x/y below
    'init': 'random',        # alternative: 'pca'
    _tsne_iter_kw: 2500,     # iteration budget for the optimisation
    'random_state': 23,      # fixed seed
}
tsne_engine = TSNE(**tsne_params)
tsne_model = tsne_engine.fit_transform(W2V)

# Attach per-term statistics from VOCAB to the 2-D coordinates. Only the three
# columns that are kept are joined; the left join preserves every embedded term.
COORDS = (
    pd.DataFrame(tsne_model, columns=['x', 'y'], index=W2V.index)
    .join(VOCAB[['n', 'dfidf', 'pos_group']], how='left')
)
COORDS['log_n'] = np.log(COORDS['n'])
COORDS
| x | y | n | dfidf | pos_group | log_n | |
|---|---|---|---|---|---|---|
| term_str | ||||||
| 0 | -0.171599 | 61.323936 | 65 | 65.289055 | CD | 4.174387 |
| 04 | 0.542611 | 5.074944 | 10 | 20.322264 | NN | 2.302585 |
| 08 | 0.813468 | 5.348079 | 10 | 11.161132 | NN | 2.302585 |
| 1 | 0.145392 | 62.795574 | 369 | 640.100772 | CD | 5.910797 |
| 10 | 0.960314 | 64.455231 | 143 | 390.523832 | CD | 4.962845 |
| ... | ... | ... | ... | ... | ... | ... |
| zoological | -21.948164 | 17.195534 | 18 | 120.252374 | JJ | 2.890372 |
| zu | 60.205723 | -7.450152 | 22 | 28.728508 | NN | 3.091042 |
| zulu | -13.550667 | 38.664394 | 12 | 58.476439 | NN | 2.484907 |
| à | 74.929085 | 4.147956 | 94 | 125.841724 | NN | 4.543295 |
| était | 61.743019 | -4.612803 | 13 | 11.161132 | NN | 2.564949 |
22373 rows × 6 columns
# Scatter a random sample of 1,000 terms from the t-SNE map:
# colour = POS group, marker size = dfidf, every point labelled with its term.
px.scatter(
    data_frame=COORDS.reset_index().sample(1000),
    x='x',
    y='y',
    text='term_str',
    color='pos_group',
    hover_name='term_str',
    size='dfidf',
    height=1000,
).update_traces(
    mode='markers+text',
    textfont={'color': 'black', 'size': 14, 'family': 'Arial'},
    textposition='top center',
)
# Same map, restricted to the 1,000 terms with the highest dfidf.
px.scatter(
    data_frame=(
        COORDS
        .reset_index()
        .sort_values('dfidf', ascending=False)
        .head(1000)
    ),
    x='x',
    y='y',
    text='term_str',
    color='pos_group',
    hover_name='term_str',
    size='dfidf',
    height=1000,
).update_traces(
    mode='markers+text',
    textfont={'color': 'black', 'size': 14, 'family': 'Arial'},
    textposition='top center',
)
# Keep only terms whose POS group is 'NN'. pos_group is the first two
# characters of max_pos, so every tag starting with NN is included.
noun_COORDS = COORDS[COORDS['pos_group'].eq('NN')]
noun_COORDS
| x | y | n | dfidf | pos_group | log_n | |
|---|---|---|---|---|---|---|
| term_str | ||||||
| 04 | 0.542611 | 5.074944 | 10 | 20.322264 | NN | 2.302585 |
| 08 | 0.813468 | 5.348079 | 10 | 11.161132 | NN | 2.302585 |
| 350 | -83.536514 | 36.456417 | 24 | 78.392038 | NN | 3.178054 |
| 87 | 2.034327 | 5.669988 | 14 | 51.457016 | NN | 2.639057 |
| 89 | 1.161920 | 5.738420 | 15 | 44.196019 | NN | 2.708050 |
| ... | ... | ... | ... | ... | ... | ... |
| zone | -32.028049 | 2.642661 | 12 | 78.392038 | NN | 2.484907 |
| zu | 60.205723 | -7.450152 | 22 | 28.728508 | NN | 3.091042 |
| zulu | -13.550667 | 38.664394 | 12 | 58.476439 | NN | 2.484907 |
| à | 74.929085 | 4.147956 | 94 | 125.841724 | NN | 4.543295 |
| était | 61.743019 | -4.612803 | 13 | 11.161132 | NN | 2.564949 |
13143 rows × 6 columns
# Scatter a random sample of 1,000 noun terms; markers are sized by
# log frequency (log_n) rather than dfidf.
px.scatter(
    data_frame=noun_COORDS.reset_index().sample(1000),
    x='x',
    y='y',
    text='term_str',
    color='pos_group',
    hover_name='term_str',
    size='log_n',
    height=1000,
).update_traces(
    mode='markers+text',
    textfont={'color': 'black', 'size': 14, 'family': 'Arial'},
    textposition='top center',
)
def complete_analogy(A, B, C, n=2, wv=None):
    """Answer the analogy "A is to B as C is to ?" from the word vectors.

    The query uses B and C as positive terms and A as the negative term.

    Parameters
    ----------
    A, B, C : str
        Terms of the analogy.
    n : int, default 2
        Number of candidate answers to return. It is forwarded as ``topn`` so
        that values above 10 are honoured instead of being capped by
        ``most_similar``'s default.
    wv : optional
        Object exposing ``most_similar(positive, negative, topn)``.
        Defaults to the notebook-level ``model.wv``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``term`` and ``sim``. ``None`` is returned, after printing the
        error, when the lookup raises ``KeyError``.
    """
    vectors = model.wv if wv is None else wv
    try:
        matches = vectors.most_similar(positive=[B, C], negative=[A], topn=n)
    except KeyError as e:
        print('Error:', e)
        return None
    return pd.DataFrame(matches, columns=['term', 'sim'])
def get_most_similar(positive, negative=None, n=10, wv=None):
    """Return the terms most similar to a query as a DataFrame.

    Parameters
    ----------
    positive : str or list of str
        Term(s) passed as the positive side of the query.
    negative : str or list of str, optional
        Term(s) passed as the negative side of the query.
    n : int, default 10
        Number of neighbours to return, forwarded as ``topn``.
    wv : optional
        Object exposing ``most_similar(positive, negative, topn)``.
        Defaults to the notebook-level ``model.wv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``sim``.
    """
    vectors = model.wv if wv is None else wv
    neighbours = vectors.most_similar(positive=positive, negative=negative, topn=n)
    return pd.DataFrame(neighbours, columns=['term', 'sim'])
# man : boy :: woman : ?  (3 candidates)
complete_analogy('man', 'boy', 'woman', 3)
| term | sim | |
|---|---|---|
| 0 | girl | 0.851387 |
| 1 | baby | 0.768748 |
| 2 | child | 0.764450 |
# girl : daughter :: boy : ?  (3 candidates)
complete_analogy('girl', 'daughter', 'boy', 3)
| term | sim | |
|---|---|---|
| 0 | son | 0.806970 |
| 1 | niece | 0.796292 |
| 2 | nephew | 0.792255 |
# girl : sister :: boy : ?  (3 candidates)
complete_analogy('girl', 'sister', 'boy', 3)
| term | sim | |
|---|---|---|
| 0 | niece | 0.803975 |
| 1 | nephew | 0.780578 |
| 2 | brother | 0.761206 |
# man : gentleman :: woman : ?  (5 candidates)
complete_analogy('man', 'gentleman', 'woman', 5)
| term | sim | |
|---|---|---|
| 0 | lady | 0.837602 |
| 1 | girl | 0.756685 |
| 2 | housekeeper | 0.730277 |
| 3 | widow | 0.726901 |
| 4 | matron | 0.669566 |
# woman : lady :: man : ?  (5 candidates)
complete_analogy('woman', 'lady', 'man', 5)
| term | sim | |
|---|---|---|
| 0 | gentleman | 0.824193 |
| 1 | person | 0.687966 |
| 2 | student | 0.618382 |
| 3 | clergyman | 0.597353 |
| 4 | lawyer | 0.588978 |
# day : sun :: night : ?  (5 candidates)
complete_analogy('day', 'sun', 'night', 5)
| term | sim | |
|---|---|---|
| 0 | moon | 0.758980 |
| 1 | rain | 0.719536 |
| 2 | sky | 0.714280 |
| 3 | sunlight | 0.712312 |
| 4 | clouds | 0.711592 |
# king : money :: servant : ?  (5 candidates)
complete_analogy('king', 'money', 'servant', 5)
| term | sim | |
|---|---|---|
| 0 | purse | 0.618348 |
| 1 | lodgings | 0.538427 |
| 2 | meals | 0.536840 |
| 3 | medicine | 0.535820 |
| 4 | property | 0.535807 |
# king : royal :: servant : ?  (5 candidates)
complete_analogy('king', 'royal', 'servant', 5)
| term | sim | |
|---|---|---|
| 0 | keepers | 0.603850 |
| 1 | cabinet | 0.594091 |
| 2 | boarding | 0.564780 |
| 3 | private | 0.564711 |
| 4 | ladyships | 0.561722 |
# king : rich :: servant : ?  (5 candidates)
complete_analogy('king', 'rich', 'servant', 5)
| term | sim | |
|---|---|---|
| 0 | nice | 0.608984 |
| 1 | shabby | 0.604531 |
| 2 | handsome | 0.594732 |
| 3 | clever | 0.577550 |
| 4 | sturdy | 0.547286 |
# lord : rich :: servant : ?  (5 candidates)
complete_analogy('lord', 'rich', 'servant', 5)
| term | sim | |
|---|---|---|
| 0 | shabby | 0.655889 |
| 1 | lazy | 0.580029 |
| 2 | tall | 0.578269 |
| 3 | clad | 0.566695 |
| 4 | sailor | 0.561728 |
# man : journey :: woman : ?  (5 candidates)
complete_analogy('man', 'journey', 'woman', 5)
| term | sim | |
|---|---|---|
| 0 | voyage | 0.678766 |
| 1 | trip | 0.639491 |
| 2 | pilgrimage | 0.582225 |
| 3 | visit | 0.535242 |
| 4 | marriage | 0.534133 |
# woman : marriage :: man : ?  (5 candidates)
complete_analogy('woman', 'marriage', 'man', 5)
| term | sim | |
|---|---|---|
| 0 | commission | 0.621347 |
| 1 | trial | 0.615661 |
| 2 | introduction | 0.601732 |
| 3 | petition | 0.596050 |
| 4 | request | 0.594801 |
# man : property :: woman : ?  (5 candidates)
complete_analogy('man', 'property', 'woman', 5)
| term | sim | |
|---|---|---|
| 0 | affairs | 0.570672 |
| 1 | estate | 0.568760 |
| 2 | religion | 0.567143 |
| 3 | society | 0.564692 |
| 4 | rights | 0.558945 |
# man : fool :: woman : ?  (5 candidates)
complete_analogy('man', 'fool', 'woman', 5)
| term | sim | |
|---|---|---|
| 0 | devil | 0.663740 |
| 1 | villain | 0.663168 |
| 2 | girl | 0.662057 |
| 3 | creetur | 0.660398 |
| 4 | beggar | 0.653608 |
# woman : fool :: man : ?  (5 candidates)
complete_analogy('woman', 'fool', 'man', 5)
| term | sim | |
|---|---|---|
| 0 | vagabond | 0.561235 |
| 1 | thief | 0.558269 |
| 2 | foreigner | 0.557487 |
| 3 | devil | 0.557404 |
| 4 | villain | 0.556923 |
# man : wise :: woman : ?  (5 candidates)
complete_analogy('man', 'wise', 'woman', 5)
| term | sim | |
|---|---|---|
| 0 | brave | 0.642014 |
| 1 | innocent | 0.607824 |
| 2 | clever | 0.591082 |
| 3 | foolish | 0.579697 |
| 4 | minded | 0.575066 |
# woman : wise :: man : ?  (5 candidates)
complete_analogy('woman', 'wise', 'man', 5)
| term | sim | |
|---|---|---|
| 0 | reasonable | 0.554601 |
| 1 | sane | 0.533982 |
| 2 | superior | 0.519525 |
| 3 | useful | 0.519105 |
| 4 | rational | 0.509219 |
# Nearest neighbours of 'joy' in the embedding.
get_most_similar('joy')
| term | sim | |
|---|---|---|
| 0 | delight | 0.778237 |
| 1 | grief | 0.755719 |
| 2 | terror | 0.754897 |
| 3 | gratitude | 0.746084 |
| 4 | admiration | 0.743671 |
| 5 | gladness | 0.729121 |
| 6 | bitterness | 0.727012 |
| 7 | horror | 0.711439 |
| 8 | earnestness | 0.698146 |
| 9 | rage | 0.697935 |
# Nearest neighbours of 'man' in the embedding.
get_most_similar('man')
| term | sim | |
|---|---|---|
| 0 | gentleman | 0.824611 |
| 1 | person | 0.804918 |
| 2 | woman | 0.780459 |
| 3 | student | 0.718592 |
| 4 | foreigner | 0.698749 |
| 5 | dog | 0.664404 |
| 6 | creature | 0.659028 |
| 7 | boy | 0.656878 |
| 8 | chap | 0.649437 |
| 9 | soldier | 0.648450 |
# Neighbours of 'man' with 'woman' supplied as a negative example.
get_most_similar(positive=['man'], negative=['woman'])
| term | sim | |
|---|---|---|
| 0 | diplomatic | 0.262227 |
| 1 | mark | 0.260356 |
| 2 | line | 0.251799 |
| 3 | men | 0.246355 |
| 4 | point | 0.236271 |
| 5 | express | 0.231483 |
| 6 | transact | 0.229944 |
| 7 | patent | 0.225700 |
| 8 | further | 0.225159 |
| 9 | record | 0.224411 |
# Nearest neighbours of 'woman' in the embedding.
get_most_similar(positive='woman')
| term | sim | |
|---|---|---|
| 0 | girl | 0.859669 |
| 1 | man | 0.780459 |
| 2 | creature | 0.777335 |
| 3 | lady | 0.759753 |
| 4 | boy | 0.735077 |
| 5 | wretch | 0.728275 |
| 6 | gentleman | 0.722586 |
| 7 | rascal | 0.721813 |
| 8 | chap | 0.712864 |
| 9 | widow | 0.702231 |
# Neighbours of 'woman' with 'man' supplied as a negative example.
get_most_similar(positive=['woman'], negative=['man'])
| term | sim | |
|---|---|---|
| 0 | jane | 0.476249 |
| 1 | sweet | 0.452006 |
| 2 | peasant | 0.427259 |
| 3 | weeping | 0.420330 |
| 4 | mary | 0.416504 |
| 5 | baby | 0.409885 |
| 6 | girl | 0.401163 |
| 7 | eldest | 0.397838 |
| 8 | buxom | 0.397097 |
| 9 | sally | 0.395690 |
# Positive terms 'man' and 'woman', negative terms 'boy' and 'girl'.
get_most_similar(['man','woman'],['boy','girl'])
| term | sim | |
|---|---|---|
| 0 | gentleman | 0.331652 |
| 1 | outward | 0.317353 |
| 2 | moral | 0.316811 |
| 3 | person | 0.288236 |
| 4 | material | 0.287824 |
| 5 | himself | 0.286915 |
| 6 | crime | 0.286068 |
| 7 | sane | 0.280468 |
| 8 | indifference | 0.270962 |
| 9 | prosperous | 0.270836 |
# Nearest neighbours of 'knowledge' in the embedding.
get_most_similar('knowledge')
| term | sim | |
|---|---|---|
| 0 | experience | 0.746146 |
| 1 | theory | 0.725863 |
| 2 | ideas | 0.721126 |
| 3 | power | 0.716801 |
| 4 | wisdom | 0.712457 |
| 5 | imagination | 0.711024 |
| 6 | design | 0.709023 |
| 7 | recollection | 0.708704 |
| 8 | belief | 0.706114 |
| 9 | profession | 0.706039 |
# Nearest neighbours of 'kindness' in the embedding.
get_most_similar('kindness')
| term | sim | |
|---|---|---|
| 0 | gratitude | 0.717582 |
| 1 | devotion | 0.710808 |
| 2 | homage | 0.709736 |
| 3 | condescension | 0.696773 |
| 4 | fortitude | 0.696501 |
| 5 | generosity | 0.696357 |
| 6 | friendship | 0.696182 |
| 7 | fidelity | 0.695630 |
| 8 | forgiveness | 0.695235 |
| 9 | affection | 0.692774 |
# Nearest neighbours of 'adventure' in the embedding.
get_most_similar('adventure')
| term | sim | |
|---|---|---|
| 0 | event | 0.794039 |
| 1 | episode | 0.781311 |
| 2 | engagement | 0.779032 |
| 3 | interview | 0.772411 |
| 4 | incident | 0.764948 |
| 5 | anecdote | 0.752455 |
| 6 | exposition | 0.752424 |
| 7 | absurdity | 0.750883 |
| 8 | enterprise | 0.747795 |
| 9 | performance | 0.743935 |
# Nearest neighbours of 'poor' in the embedding.
get_most_similar('poor')
| term | sim | |
|---|---|---|
| 0 | miserable | 0.638907 |
| 1 | wretched | 0.638791 |
| 2 | wicked | 0.615074 |
| 3 | sick | 0.604765 |
| 4 | foolish | 0.601449 |
| 5 | friendless | 0.590747 |
| 6 | silly | 0.584538 |
| 7 | peasant | 0.579586 |
| 8 | brave | 0.578708 |
| 9 | darling | 0.575868 |
# Nearest neighbours of 'money' in the embedding.
get_most_similar('money')
| term | sim | |
|---|---|---|
| 0 | trouble | 0.681588 |
| 1 | food | 0.657160 |
| 2 | debt | 0.618269 |
| 3 | purchase | 0.603006 |
| 4 | property | 0.599279 |
| 5 | wages | 0.586387 |
| 6 | reward | 0.582792 |
| 7 | security | 0.576184 |
| 8 | bill | 0.565190 |
| 9 | employment | 0.563871 |
# Nearest neighbours of 'rich' in the embedding.
get_most_similar('rich')
| term | sim | |
|---|---|---|
| 0 | healthy | 0.670826 |
| 1 | clever | 0.637012 |
| 2 | picturesque | 0.627724 |
| 3 | thirsty | 0.616304 |
| 4 | tough | 0.599806 |
| 5 | pure | 0.598837 |
| 6 | colored | 0.591493 |
| 7 | hungry | 0.588078 |
| 8 | prim | 0.587944 |
| 9 | showy | 0.585651 |
# W2V.to_csv(f'{data_home}/{data_prefix}/{data_prefix}-W2V.csv')
# VOCAB.to_csv(f'{data_home}/{data_prefix}/{data_prefix}-VOCAB.csv')
# SENTS.to_csv(f'{data_home}/{data_prefix}/{data_prefix}-GENSIM_DOCS.csv')
Reference for pandas `drop` and `loc`: https://www.geeksforgeeks.org/how-to-drop-one-or-multiple-columns-in-pandas-dataframe/